{ "cells": [ { "cell_type": "markdown", "id": "486801ce", "metadata": {}, "source": [ "# Model - SpaCy" ] }, { "cell_type": "markdown", "id": "f1a813d7", "metadata": {}, "source": [ "### Imports" ] }, { "cell_type": "code", "execution_count": 1, "id": "8eb77ef5", "metadata": {}, "outputs": [], "source": [ "# All imports live in this cell: standard library, third-party, then project-local.\n", "import os\n", "\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "\n", "from iqual import iqualnlp, evaluation, crossval" ] }, { "cell_type": "markdown", "id": "17a42bc8", "metadata": {}, "source": [ "### Load `annotated (human-coded)` and `unannotated` datasets" ] }, { "cell_type": "code", "execution_count": 2, "id": "a7d035ab", "metadata": {}, "outputs": [], "source": [ "# Relative path to the directory holding the CSV inputs.\n", "data_dir = \"../../data\"\n", "\n", "# annotated.csv is the human-coded set that gets split into train/test below;\n", "# unannotated.csv is loaded alongside it as the uncoded set.\n", "human_coded_df = pd.read_csv(os.path.join(data_dir, \"annotated.csv\"))\n", "uncoded_df = pd.read_csv(os.path.join(data_dir, \"unannotated.csv\"))" ] }, { "cell_type": "markdown", "id": "9b308f43", "metadata": {}, "source": [ "### Split the data into training and test sets" ] }, { "cell_type": "code", "execution_count": 3, "id": "e0f95233", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train Size: 7470\n", "Test Size: 2490\n" ] } ], "source": [ "# Seed the shuffle so the same rows land in train/test on every Restart & Run All.\n", "SEED = 42\n", "\n", "# Hold out 25% of the human-coded rows for testing.\n", "train_df, test_df = train_test_split(human_coded_df, test_size=0.25, random_state=SEED)\n", "print(f\"Train Size: {len(train_df)}\\nTest Size: {len(test_df)}\")" ] }, { "cell_type": "markdown", "id": "f0ea3ad2", "metadata": {}, "source": [ "### Configure training data" ] }, { "cell_type": "code", "execution_count": 4, "id": "ebffbf2c", "metadata": {}, "outputs": [], "source": [ "# Columns holding the question text and the answer text\n", "question_col = 'Q_en'\n", "answer_col = 'A_en'\n", "\n", "# Column of train_df used as the target (the code to predict)\n", "code_variable = 'marriage'\n", "\n", "# X keeps both text columns as a two-column DataFrame; y is the selected code column\n", "X = train_df[[question_col, answer_col]]\n", "y = train_df[code_variable]" ] }, { "cell_type": "raw", "id": "fa7f9df4", "metadata": {}, "source": [ "# NOTE: Make sure to download spacy language models before using 
them.\n", "\n", "# English - Small\n", "!python -m spacy download en_core_web_sm\n", "\n", "# English - Medium\n", "!python -m spacy download en_core_web_md\n", "\n", "# English - Large\n", "!python -m spacy download en_core_web_lg" ] }, { "cell_type": "markdown", "id": "23910543", "metadata": {}, "source": [ "### Initialize model" ] }, { "cell_type": "code", "execution_count": 5, "id": "ea7cd53b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('Input',\n", " FeatureUnion(transformer_list=[('question',\n", " Pipeline(steps=[('selector',\n", " FunctionTransformer(func=<function column_selector at 0x00000274D4B7A8B0>,\n", " kw_args={'column_name': 'Q_en'})),\n", " ('vectorizer',\n", " Vectorizer(env='spacy',\n", " model='en_core_web_sm'))])),\n", " ('answer',\n", " Pipeline(steps=[('selector',\n", " FunctionTransformer(func=<function column_select...\n", " ('FeatureTransformation', FeatureScaler(copy=True, norm='l2')),\n", " ('Classifier',\n", " Classifier(C=1.0, class_weight=None, dual=False,\n", " fit_intercept=True, intercept_scaling=1,\n", " l1_ratio=None, max_iter=100,\n", " model='LogisticRegression', multi_class='auto',\n", " n_jobs=None, penalty='l2', random_state=None,\n", " solver='lbfgs', tol=0.0001, verbose=0,\n", " warm_start=False)),\n", " ('Threshold', BinaryThresholder())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('Input',\n", " FeatureUnion(transformer_list=[('question',\n", " Pipeline(steps=[('selector',\n", " FunctionTransformer(func=<function column_selector at 0x00000274D4B7A8B0>,\n", " kw_args={'column_name': 'Q_en'})),\n", " ('vectorizer',\n", " Vectorizer(env='spacy',\n", " model='en_core_web_sm'))])),\n", " ('answer',\n", " Pipeline(steps=[('selector',\n", " FunctionTransformer(func=<function column_select...\n", " ('FeatureTransformation', FeatureScaler(copy=True, norm='l2')),\n", " ('Classifier',\n", " Classifier(C=1.0, class_weight=None, dual=False,\n", " fit_intercept=True, intercept_scaling=1,\n", " l1_ratio=None, max_iter=100,\n", " model='LogisticRegression', multi_class='auto',\n", " n_jobs=None, penalty='l2', random_state=None,\n", " solver='lbfgs', tol=0.0001, verbose=0,\n", " warm_start=False)),\n", " ('Threshold', BinaryThresholder())])
FeatureUnion(transformer_list=[('question',\n", " Pipeline(steps=[('selector',\n", " FunctionTransformer(func=<function column_selector at 0x00000274D4B7A8B0>,\n", " kw_args={'column_name': 'Q_en'})),\n", " ('vectorizer',\n", " Vectorizer(env='spacy',\n", " model='en_core_web_sm'))])),\n", " ('answer',\n", " Pipeline(steps=[('selector',\n", " FunctionTransformer(func=<function column_selector at 0x00000274D4B7A8B0>,\n", " kw_args={'column_name': 'A_en'})),\n", " ('vectorizer',\n", " Vectorizer(env='spacy',\n", " model='en_core_web_sm'))]))])
FunctionTransformer(func=<function column_selector at 0x00000274D4B7A8B0>,\n", " kw_args={'column_name': 'Q_en'})
Vectorizer(env='spacy', model='en_core_web_sm')
FunctionTransformer(func=<function column_selector at 0x00000274D4B7A8B0>,\n", " kw_args={'column_name': 'A_en'})
Vectorizer(env='spacy', model='en_core_web_sm')
FeatureScaler(copy=True, norm='l2')
Classifier(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, l1_ratio=None, max_iter=100,\n", " model='LogisticRegression', multi_class='auto', n_jobs=None,\n", " penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,\n", " verbose=0, warm_start=False)
BinaryThresholder()