{ "cells": [ { "cell_type": "markdown", "id": "aec20cd0", "metadata": {}, "source": [ "# Model - Multiple Vectorizers" ] }, { "cell_type": "markdown", "id": "f1a813d7", "metadata": {}, "source": [ "### Imports" ] }, { "cell_type": "code", "execution_count": 3, "id": "8eb77ef5", "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "from iqual import iqualnlp, evaluation, crossval, vectorizers" ] }, { "cell_type": "markdown", "id": "17a42bc8", "metadata": {}, "source": [ "### Load `annotated (human-coded)` and `unannotated` datasets" ] }, { "cell_type": "code", "execution_count": 4, "id": "a7d035ab", "metadata": {}, "outputs": [], "source": [ "data_dir = \"../../data\"\n", "human_coded_df = pd.read_csv(os.path.join(data_dir,\"annotated.csv\"))\n", "uncoded_df = pd.read_csv(os.path.join(data_dir,\"unannotated.csv\"))" ] }, { "cell_type": "markdown", "id": "db9ac6dd", "metadata": {}, "source": [ "### Load `spacy` or `sentence-transformers` using precomputed dictionaries" ] }, { "cell_type": "code", "execution_count": 5, "id": "8eacd7b8", "metadata": {}, "outputs": [], "source": [ "dict_dir = \"../../dictionaries\"\n", "\n", "sentence_transformer_models = [\"all-mpnet-base-v2\", \"all-roberta-large-v1\",\"distiluse-base-multilingual-cased-v2\"]\n", "spacy_models = [\"en_core_web_sm\",\"en_core_web_md\",\"en_core_web_lg\"]\n", "model_paths = [os.path.join(dict_dir,m+'.pkl') for m in [*sentence_transformer_models,*spacy_models]]" ] }, { "cell_type": "markdown", "id": "9b308f43", "metadata": {}, "source": [ "### Split the data into training and test sets" ] }, { "cell_type": "code", "execution_count": 6, "id": "e0f95233", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train Size: 7470\n", "Test Size: 2490\n" ] } ], "source": [ "from sklearn.model_selection import train_test_split\n", "train_df, test_df = train_test_split(human_coded_df,test_size=0.25)\n", "print(f\"Train Size: {len(train_df)}\\nTest Size: {len(test_df)}\")" ] }, { "cell_type": "markdown", "id": "f0ea3ad2", "metadata": {}, "source": [ "### Configure training data" ] }, { "cell_type": "code", "execution_count": 7, "id": "ebffbf2c", "metadata": {}, "outputs": [], "source": [ "### Select Question and Answer Columns\n", "question_col = 'Q_en'\n", "answer_col = 'A_en'\n", "\n", "### Select a code\n", "code_variable = 'marriage'\n", "\n", "### Create X and y\n", "X = train_df[[question_col,answer_col]]\n", "y = train_df[code_variable]" ] }, { "cell_type": "markdown", "id": "23910543", "metadata": {}, "source": [ "### Initiate model" ] }, { "cell_type": "code", "execution_count": 11, "id": "ea7cd53b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
{ "cell_type": "code", "execution_count": 11, "id": "ea7cd53b", "metadata": {}, "outputs": [ { "data": { "text/plain": [

Pipeline(steps=[
    ('Input',
     FeatureUnion(transformer_list=[
         ('question',
          Pipeline(steps=[
              ('selector', FunctionTransformer(func=<function column_selector at 0x0000023B71C74280>,
                                               kw_args={'column_name': 'Q_en'})),
              ('vectorizer', Vectorizer(env='saved-dictionary',
                                        model='../dictionaries/all-mpnet-base-v2.pkl'))])),
         ('answer',
          Pipeline(steps=[
              ('selector', FunctionTransformer(func=<function column_selector at 0x0000023B71C74280>,
                                               kw_args={'column_name': 'A_en'})),
              ('vectorizer', Vectorizer(env='saved-dictionary',
                                        model='../dictionaries/all-mpnet-base-v2.pkl'))]))])),
    ('Classifier',
     Classifier(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                intercept_scaling=1, l1_ratio=None, max_iter=100,
                model='LogisticRegression', multi_class='auto', n_jobs=None,
                penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
                verbose=0, warm_start=False)),
    ('Threshold', BinaryThresholder())])
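Defining several dictionary paths up front only pays off if the choice of vectorizer is itself cross-validated. The pipeline shown after this sketch is the outcome of such a search: the `distiluse-base-multilingual-cased-v2` dictionary, `C=0.1`, and a fitted decision threshold. The sketch below shows how the search might be configured; the nested parameter layout, `crossval.convert_nested_params`, and `Model.cross_validate_fit` with these keyword arguments are assumptions drawn from the iqual examples.

```python
# Sketch only: assumed iqual cross-validation API.
from iqual import crossval

# Candidate hyperparameters, keyed by the pipeline step names shown above:
# every saved-dictionary vectorizer in `model_paths` is tried for both the
# question and the answer branch, together with several regularization strengths.
search_param_config = {
    "Input": {
        "question": {"vectorizer": {"model": model_paths, "env": ["saved-dictionary"]}},
        "answer":   {"vectorizer": {"model": model_paths, "env": ["saved-dictionary"]}},
    },
    "Classifier": {"model": ["LogisticRegression"], "C": [0.01, 0.1, 1.0]},
}

# Flatten the nested layout into scikit-learn 'step__param' names (assumed helper).
CV_SEARCH_PARAMS = crossval.convert_nested_params(search_param_config)

# Grid-search with cross-validation, then refit on the best configuration
# (assumed method name and keyword arguments).
cv_result = model.cross_validate_fit(
    X, y,
    search_parameters=CV_SEARCH_PARAMS,
    cv_method="GridSearchCV",
    scoring="f1",
    refit="f1",
    n_jobs=-1,
    cv_splits=3,
)
```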
Pipeline(steps=[
    ('Input',
     FeatureUnion(transformer_list=[
         ('question',
          Pipeline(steps=[
              ('selector', FunctionTransformer(func=<function column_selector at 0x0000023B71C74280>,
                                               kw_args={'column_name': 'Q_en'})),
              ('vectorizer', Vectorizer(env='saved-dictionary',
                                        model='../../dictionaries\\distiluse-base-multilingual-cased-v2.pkl'))])),
         ('answer',
          Pipeline(steps=[
              ('selector', FunctionTransformer(func=<function column_selector at 0x0000023B71C74280>,
                                               kw_args={'column_name': 'A_en'})),
              ('vectorizer', Vectorizer(env='saved-dictionary',
                                        model='../../dictionaries\\distiluse-base-multilingual-cased-v2.pkl'))]))])),
    ('Classifier',
     Classifier(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                intercept_scaling=1, l1_ratio=None, max_iter=100,
                model='LogisticRegression', multi_class='auto', n_jobs=None,
                penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
                verbose=0, warm_start=False)),
    ('Threshold',
     BinaryThresholder(threshold=0.1919019469710422,
                       threshold_range=(0.004475209108360682, 0.8112250807781638)))])
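Once fitted, the object above is a regular scikit-learn `Pipeline`, so it can be scored on the held-out split and used to label the unannotated interviews. A brief usage sketch follows; `model.predict` delegating to the fitted pipeline is an assumption, and the column names come from the cells above.

```python
# Sketch only: applying the fitted model to held-out and unannotated data.
from sklearn.metrics import f1_score

# Out-of-sample check on the 25% test split created earlier.
test_X = test_df[[question_col, answer_col]]
test_pred = model.predict(test_X)              # assumed to call the fitted Pipeline
print("Out-of-sample F1:", f1_score(test_df[code_variable], test_pred))

# Label the unannotated interviews with machine-assigned codes.
uncoded_df[code_variable + "_pred"] = model.predict(uncoded_df[[question_col, answer_col]])
```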