{ "cells": [ { "cell_type": "markdown", "id": "de8b7f60", "metadata": {}, "source": [ "# Model - Scikit-Learn" ] }, { "cell_type": "markdown", "id": "f1a813d7", "metadata": {}, "source": [ "### Imports" ] }, { "cell_type": "code", "execution_count": 1, "id": "25a735ed", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "\n", "from iqual import iqualnlp, evaluation, crossval" ] }, { "cell_type": "markdown", "id": "17a42bc8", "metadata": {}, "source": [ "### Load `annotated (human-coded)` and `unannotated` datasets" ] }, { "cell_type": "code", "execution_count": 2, "id": "a7d035ab", "metadata": {}, "outputs": [], "source": [ "# Both CSVs are read from a directory given relative to the notebook's working directory\n", "data_dir = \"../../data\"\n", "human_coded_df = pd.read_csv(os.path.join(data_dir, \"annotated.csv\"))\n", "uncoded_df = pd.read_csv(os.path.join(data_dir, \"unannotated.csv\"))" ] }, { "cell_type": "markdown", "id": "9b308f43", "metadata": {}, "source": [ "### Split the data into training and test sets" ] }, { "cell_type": "code", "execution_count": 3, "id": "e0f95233", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train Size: 7470\n", "Test Size: 2490\n" ] } ], "source": [ "# Seed the shuffle so the same rows land in train/test on every Restart & Run All;\n", "# without `random_state` each run draws a different partition.\n", "RANDOM_STATE = 42\n", "train_df, test_df = train_test_split(\n", "    human_coded_df, test_size=0.25, random_state=RANDOM_STATE\n", ")\n", "print(f\"Train Size: {len(train_df)}\\nTest Size: {len(test_df)}\")" ] }, { "cell_type": "markdown", "id": "f0ea3ad2", "metadata": {}, "source": [ "### Configure training data" ] }, { "cell_type": "code", "execution_count": 4, "id": "ebffbf2c", "metadata": {}, "outputs": [], "source": [ "### Select Question and Answer Columns\n", "question_col = 'Q_en'\n", "answer_col = 'A_en'\n", "\n", "### Select a code (the column of `train_df` used as the target)\n", "code_variable = 'marriage'\n", "\n", "### Create X (question + answer text columns) and y (selected code) from the training split\n", "X = train_df[[question_col, answer_col]]\n", "y = train_df[code_variable]" ] }, { "cell_type": "markdown", "id": "23910543", "metadata": {}, "source": [ "### Initiate model" ] }, { "cell_type": "code", 
"execution_count": 5, "id": "ecd4446c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('Input',\n", " FeatureUnion(transformer_list=[('question',\n", " Pipeline(steps=[('selector',\n", " FunctionTransformer(func=<function column_selector at 0x000002B49C2C9820>,\n", " kw_args={'column_name': 'Q_en'})),\n", " ('vectorizer',\n", " Vectorizer(analyzer='word',\n", " binary=False,\n", " decode_error='strict',\n", " dtype=<class 'numpy.float64'>,\n", " encoding='utf-8',\n", " env='scikit-learn',\n", " input='co...\n", " tokenizer=None,\n", " use_idf=True,\n", " vocabulary=None))]))])),\n", " ('Classifier',\n", " Classifier(C=1.0, class_weight=None, dual=False,\n", " fit_intercept=True, intercept_scaling=1,\n", " l1_ratio=None, max_iter=100,\n", " model='LogisticRegression', multi_class='auto',\n", " n_jobs=None, penalty='l2', random_state=None,\n", " solver='lbfgs', tol=0.0001, verbose=0,\n", " warm_start=False)),\n", " ('Threshold', BinaryThresholder())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('Input',\n", " FeatureUnion(transformer_list=[('question',\n", " Pipeline(steps=[('selector',\n", " FunctionTransformer(func=<function column_selector at 0x000002B49C2C9820>,\n", " kw_args={'column_name': 'Q_en'})),\n", " ('vectorizer',\n", " Vectorizer(analyzer='word',\n", " binary=False,\n", " decode_error='strict',\n", " dtype=<class 'numpy.float64'>,\n", " encoding='utf-8',\n", " env='scikit-learn',\n", " input='co...\n", " tokenizer=None,\n", " use_idf=True,\n", " vocabulary=None))]))])),\n", " ('Classifier',\n", " Classifier(C=1.0, class_weight=None, dual=False,\n", " fit_intercept=True, intercept_scaling=1,\n", " l1_ratio=None, max_iter=100,\n", " model='LogisticRegression', multi_class='auto',\n", " n_jobs=None, penalty='l2', random_state=None,\n", " solver='lbfgs', tol=0.0001, verbose=0,\n", " warm_start=False)),\n", " ('Threshold', BinaryThresholder())])
FeatureUnion(transformer_list=[('question',\n", " Pipeline(steps=[('selector',\n", " FunctionTransformer(func=<function column_selector at 0x000002B49C2C9820>,\n", " kw_args={'column_name': 'Q_en'})),\n", " ('vectorizer',\n", " Vectorizer(analyzer='word',\n", " binary=False,\n", " decode_error='strict',\n", " dtype=<class 'numpy.float64'>,\n", " encoding='utf-8',\n", " env='scikit-learn',\n", " input='content',\n", " lowercase=True,\n", " max...\n", " dtype=<class 'numpy.float64'>,\n", " encoding='utf-8',\n", " env='scikit-learn',\n", " input='content',\n", " lowercase=True,\n", " max_df=1.0,\n", " max_features=None,\n", " min_df=1,\n", " model='TfidfVectorizer',\n", " ngram_range=(1, 1),\n", " norm='l2',\n", " preprocessor=None,\n", " smooth_idf=True,\n", " stop_words=None,\n", " strip_accents=None,\n", " sublinear_tf=False,\n", " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", " tokenizer=None,\n", " use_idf=True,\n", " vocabulary=None))]))])
FunctionTransformer(func=<function column_selector at 0x000002B49C2C9820>,\n", " kw_args={'column_name': 'Q_en'})
Vectorizer(analyzer='word', binary=False, decode_error='strict',\n", " dtype=<class 'numpy.float64'>, encoding='utf-8', env='scikit-learn',\n", " input='content', lowercase=True, max_df=1.0, max_features=None,\n", " min_df=1, model='TfidfVectorizer', ngram_range=(1, 1), norm='l2',\n", " preprocessor=None, smooth_idf=True, stop_words=None,\n", " strip_accents=None, sublinear_tf=False,\n", " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n", " vocabulary=None)
FunctionTransformer(func=<function column_selector at 0x000002B49C2C9820>,\n", " kw_args={'column_name': 'A_en'})
Vectorizer(analyzer='word', binary=False, decode_error='strict',\n", " dtype=<class 'numpy.float64'>, encoding='utf-8', env='scikit-learn',\n", " input='content', lowercase=True, max_df=1.0, max_features=None,\n", " min_df=1, model='TfidfVectorizer', ngram_range=(1, 1), norm='l2',\n", " preprocessor=None, smooth_idf=True, stop_words=None,\n", " strip_accents=None, sublinear_tf=False,\n", " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n", " vocabulary=None)
Classifier(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, l1_ratio=None, max_iter=100,\n", " model='LogisticRegression', multi_class='auto', n_jobs=None,\n", " penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,\n", " verbose=0, warm_start=False)
BinaryThresholder()