{ "cells": [ { "cell_type": "markdown", "id": "aec20cd0", "metadata": {}, "source": [ "# Model - Multiple Vectorizers" ] }, { "cell_type": "markdown", "id": "f1a813d7", "metadata": {}, "source": [ "### Imports" ] }, { "cell_type": "code", "execution_count": 3, "id": "8eb77ef5", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "\n", "from iqual import iqualnlp, evaluation, crossval, vectorizers" ] }, { "cell_type": "markdown", "id": "17a42bc8", "metadata": {}, "source": [ "### Load `annotated (human-coded)` and `unannotated` datasets" ] }, { "cell_type": "code", "execution_count": 4, "id": "a7d035ab", "metadata": {}, "outputs": [], "source": [ "data_dir = \"../../data\"\n", "human_coded_df = pd.read_csv(os.path.join(data_dir,\"annotated.csv\"))\n", "uncoded_df = pd.read_csv(os.path.join(data_dir,\"unannotated.csv\"))" ] }, { "cell_type": "markdown", "id": "db9ac6dd", "metadata": {}, "source": [ "### Build paths to the precomputed `spacy` and `sentence-transformers` dictionaries" ] }, { "cell_type": "code", "execution_count": 5, "id": "8eacd7b8", "metadata": {}, "outputs": [], "source": [ "dict_dir = \"../../dictionaries\"\n", "\n", "sentence_transformer_models = [\"all-mpnet-base-v2\", \"all-roberta-large-v1\",\"distiluse-base-multilingual-cased-v2\"]\n", "spacy_models = [\"en_core_web_sm\",\"en_core_web_md\",\"en_core_web_lg\"]\n", "# One pickled-dictionary path per model name; this cell only builds the path strings\n", "model_paths = [os.path.join(dict_dir,m+'.pkl') for m in [*sentence_transformer_models,*spacy_models]]" ] }, { "cell_type": "markdown", "id": "9b308f43", "metadata": {}, "source": [ "### Split the data into training and test sets" ] }, { "cell_type": "code", "execution_count": 6, "id": "e0f95233", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train Size: 7470\n", "Test Size: 2490\n" ] } ], "source": [ "# Fixed seed so the train/test split (and every score derived from it) is reproducible on re-run\n", "SEED = 42\n", "train_df, test_df = train_test_split(human_coded_df,test_size=0.25,random_state=SEED)\n", "print(f\"Train Size: {len(train_df)}\\nTest Size: 
{len(test_df)}\")" ] }, { "cell_type": "markdown", "id": "f0ea3ad2", "metadata": {}, "source": [ "### Configure training data" ] }, { "cell_type": "code", "execution_count": 7, "id": "ebffbf2c", "metadata": {}, "outputs": [], "source": [ "### Select Question and Answer Columns\n", "question_col = 'Q_en'\n", "answer_col = 'A_en'\n", "\n", "### Select a code\n", "code_variable = 'marriage'\n", "\n", "### Create X and y\n", "X = train_df[[question_col,answer_col]]\n", "y = train_df[code_variable]" ] }, { "cell_type": "markdown", "id": "23910543", "metadata": {}, "source": [ "### Initialize model" ] }, { "cell_type": "code", "execution_count": 11, "id": "ea7cd53b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('Input',\n",
       "                 FeatureUnion(transformer_list=[('question',\n",
       "                                                 Pipeline(steps=[('selector',\n",
       "                                                                  FunctionTransformer(func=<function column_selector at 0x0000023B71C74280>,\n",
       "                                                                                      kw_args={'column_name': 'Q_en'})),\n",
       "                                                                 ('vectorizer',\n",
       "                                                                  Vectorizer(env='saved-dictionary',\n",
       "                                                                             model='../dictionaries/all-mpnet-base-v2.pkl'))])),\n",
       "                                                ('answer',\n",
       "                                                 Pipeline(steps=[('selector',\n",
       "                                                                  FunctionTrans...\n",
       "                                                                             model='../dictionaries/all-mpnet-base-v2.pkl'))]))])),\n",
       "                ('Classifier',\n",
       "                 Classifier(C=1.0, class_weight=None, dual=False,\n",
       "                            fit_intercept=True, intercept_scaling=1,\n",
       "                            l1_ratio=None, max_iter=100,\n",
       "                            model='LogisticRegression', multi_class='auto',\n",
       "                            n_jobs=None, penalty='l2', random_state=None,\n",
       "                            solver='lbfgs', tol=0.0001, verbose=0,\n",
       "                            warm_start=False)),\n",
       "                ('Threshold', BinaryThresholder())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('Input',\n", " FeatureUnion(transformer_list=[('question',\n", " Pipeline(steps=[('selector',\n", " FunctionTransformer(func=,\n", " kw_args={'column_name': 'Q_en'})),\n", " ('vectorizer',\n", " Vectorizer(env='saved-dictionary',\n", " model='../dictionaries/all-mpnet-base-v2.pkl'))])),\n", " ('answer',\n", " Pipeline(steps=[('selector',\n", " FunctionTrans...\n", " model='../dictionaries/all-mpnet-base-v2.pkl'))]))])),\n", " ('Classifier',\n", " Classifier(C=1.0, class_weight=None, dual=False,\n", " fit_intercept=True, intercept_scaling=1,\n", " l1_ratio=None, max_iter=100,\n", " model='LogisticRegression', multi_class='auto',\n", " n_jobs=None, penalty='l2', random_state=None,\n", " solver='lbfgs', tol=0.0001, verbose=0,\n", " warm_start=False)),\n", " ('Threshold', BinaryThresholder())])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Step 1: Initialize the model class\n", "iqual_model = iqualnlp.Model()\n", "\n", "# Step 2: Add layers to the model\n", "# Add text columns, and choose a feature extraction model (Available options: scikit-learn, spacy, sentence-transformers, saved-dictionary (pickled dictionary))\n", "# The path is built from `dict_dir` so the initial vectorizer points at the same directory as the search grid\n", "iqual_model.add_text_features(question_col,answer_col,model=os.path.join(dict_dir,'all-mpnet-base-v2.pkl'),env='saved-dictionary')\n", "\n", "# Step 3: Add a feature transforming layer (optional)\n", "# A. Choose a feature-scaler. Available options: \n", "# any scikit-learn scaler from `sklearn.preprocessing`\n", "### iqual_model.add_feature_transformer(name='StandardScaler', transformation=\"FeatureScaler\")\n", "# OR\n", "# B. Choose a dimensionality reduction model. 
Available options:\n", "# - Any scikit-learn dimensionality reduction model from `sklearn.decomposition`\n", "# - Uniform Manifold Approximation and Projection (UMAP) using umap.UMAP (https://umap-learn.readthedocs.io/en/latest/)\n", "\n", "### iqual_model.add_feature_transformer(name='PCA', transformation=\"DimensionalityReduction\")\n", "\n", "# Step 4: Add a classifier layer\n", "# Choose a primary classifier model (Available options: any scikit-learn classifier)\n", "iqual_model.add_classifier(name=\"LogisticRegression\")\n", "\n", "# Step 5: Add a threshold layer. This is optional, but recommended for binary classification\n", "iqual_model.add_threshold(scoring_metric='f1')\n", "\n", "# Step 6: Compile the model\n", "iqual_model.compile()" ] }, { "cell_type": "markdown", "id": "0e7ce910", "metadata": {}, "source": [ "### Configure a hyperparameter grid for cross-validation and fitting" ] }, { "cell_type": "code", "execution_count": 12, "id": "f1317cc3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[{'Input__question__vectorizer__model': ['../../dictionaries\\\\en_core_web_sm.pkl', '../../dictionaries\\\\en_core_web_md.pkl', '../../dictionaries\\\\en_core_web_lg.pkl', '../../dictionaries\\\\all-mpnet-base-v2.pkl', '../../dictionaries\\\\all-roberta-large-v1.pkl', '../../dictionaries\\\\distiluse-base-multilingual-cased-v2.pkl'], 'Input__question__vectorizer__env': ['saved-dictionary'], 'Input__answer__vectorizer__model': ['../../dictionaries\\\\en_core_web_sm.pkl', '../../dictionaries\\\\en_core_web_md.pkl', '../../dictionaries\\\\en_core_web_lg.pkl', '../../dictionaries\\\\all-mpnet-base-v2.pkl', '../../dictionaries\\\\all-roberta-large-v1.pkl', '../../dictionaries\\\\distiluse-base-multilingual-cased-v2.pkl'], 'Input__answer__vectorizer__env': ['saved-dictionary'], 'Classifier__model': ['LogisticRegression'], 'Classifier__C': [0.01, 0.1]}, {'Input__question__vectorizer__model': ['TfidfVectorizer', 'CountVectorizer'], 
'Input__question__vectorizer__max_features': [500, 1000, 1500, 2500], 'Input__question__vectorizer__env': ['scikit-learn'], 'Input__answer__vectorizer__model': ['TfidfVectorizer', 'CountVectorizer'], 'Input__answer__vectorizer__max_features': [1500, 2500, 4000], 'Input__answer__vectorizer__env': ['scikit-learn'], 'Classifier__model': ['LogisticRegression'], 'Classifier__C': [0.01, 0.1]}]\n" ] } ], "source": [ "# `model_paths` (one pickled dictionary per spacy / sentence-transformers model) is built once in the\n", "# dictionary-path cell near the top of the notebook; it is reused here instead of being redefined.\n", "\n", "# Saved-dictionary (precomputed vectors using spacy/sentence-transformers)\n", "params_saved = {\n", "    \"Input\":{\n", "        \"question\":{\n", "            \"vectorizer\":{\n", "                \"model\":model_paths,\n", "                \"env\":[\"saved-dictionary\"],\n", "            },\n", "        },\n", "        \"answer\":{\n", "            \"vectorizer\":{\n", "                \"model\":model_paths,\n", "                \"env\":[\"saved-dictionary\"],\n", "            },\n", "        },\n", "    },\n", "\n", "    \"Classifier\":{\n", "        \"model\":[\"LogisticRegression\"],\n", "        \"C\":[0.01,0.1],\n", "    },\n", "}\n", "\n", "# Scikit-learn vectors (TfidfVectorizer/CountVectorizer)\n", "\n", "params_sklearn = {\n", "    \"Input\":{\n", "        \"question\":{\n", "            \"vectorizer\":{\n", "                \"model\":['TfidfVectorizer','CountVectorizer'],\n", "                \"max_features\":[500,1000,1500,2500,],\n", "                \"env\":[\"scikit-learn\"],\n", "            },\n", "        },\n", "        \"answer\":{\n", "            \"vectorizer\":{\n", "                \"model\":['TfidfVectorizer','CountVectorizer'],\n", "                \"max_features\":[1500,2500,4000,],\n", "                \"env\":[\"scikit-learn\"],\n", "            },\n", "        },\n", "    },\n", "\n", "    \"Classifier\":{\n", "        \"model\":[\"LogisticRegression\"],\n", "        \"C\":[0.01,0.1],\n", "    },\n", "}\n", "\n", "# One flattened grid per vectorizer family; the search draws candidates from both\n", "CV_SEARCH_PARAMS = [\n", "    crossval.convert_nested_params(params_saved),\n", "    crossval.convert_nested_params(params_sklearn)\n", "]\n", "print(CV_SEARCH_PARAMS)" ] }, { "cell_type": "markdown", "id": "5250bd6d", "metadata": {}, "source": [ "## Model training:\n", "Cross-validate over hyperparameters and select the best model" ] 
}, { "cell_type": "code", "execution_count": 36, "id": "fa163fed", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ".......168 hyperparameters configurations possible.....\n", "Average F1 score: 0.835\n" ] } ], "source": [ "# Scoring Dict for evaluation\n", "scoring_dict = {'f1':evaluation.get_scorer('f1')}\n", "\n", "cv_dict = iqual_model.cross_validate_fit(\n", "    X,y,                                # X: Pandas DataFrame of features, y: Pandas Series of labels\n", "    search_parameters=CV_SEARCH_PARAMS, # search_parameters: List of parameter grids (one dict per vectorizer family)\n", "    cv_method='RandomizedSearchCV',     # cv_method: Cross-validation method to use, options: GridSearchCV, RandomizedSearchCV\n", "    n_iter=50,                          # n_iter: Number of iterations for RandomizedSearchCV\n", "    scoring=scoring_dict,               # scoring: Scoring metric to use for cross-validation\n", "    refit='f1',                         # refit: Metric to use for refitting the model\n", "    n_jobs=-1,                          # n_jobs: Number of parallel threads to use\n", "    cv_splits=3,                        # cv_splits: Number of cross-validation splits\n", ")\n", "print()\n", "print(\"Average F1 score: {:.3f}\".format(cv_dict['avg_test_score']))" ] }, { "cell_type": "markdown", "id": "569c0837", "metadata": {}, "source": [ "### Check best model" ] }, { "cell_type": "code", "execution_count": 39, "id": "6caefc80", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Q vectorizer: ../../dictionaries\\distiluse-base-multilingual-cased-v2.pkl\n", "A vectorizer: ../../dictionaries\\distiluse-base-multilingual-cased-v2.pkl\n", "Classifier: LogisticRegression\n", "Threshold for classification: 0.192\n" ] }, { "data": { "text/html": [ "
Pipeline(steps=[('Input',\n",
       "                 FeatureUnion(transformer_list=[('question',\n",
       "                                                 Pipeline(steps=[('selector',\n",
       "                                                                  FunctionTransformer(func=<function column_selector at 0x0000023B71C74280>,\n",
       "                                                                                      kw_args={'column_name': 'Q_en'})),\n",
       "                                                                 ('vectorizer',\n",
       "                                                                  Vectorizer(env='saved-dictionary',\n",
       "                                                                             model='../../dictionaries\\\\distiluse-base-multilingual-cased-v2.pkl'))])),\n",
       "                                                ('answer',\n",
       "                                                 Pipeline(steps=[('...\n",
       "                 Classifier(C=0.1, class_weight=None, dual=False,\n",
       "                            fit_intercept=True, intercept_scaling=1,\n",
       "                            l1_ratio=None, max_iter=100,\n",
       "                            model='LogisticRegression', multi_class='auto',\n",
       "                            n_jobs=None, penalty='l2', random_state=None,\n",
       "                            solver='lbfgs', tol=0.0001, verbose=0,\n",
       "                            warm_start=False)),\n",
       "                ('Threshold',\n",
       "                 BinaryThresholder(threshold=0.1919019469710422,\n",
       "                                   threshold_range=(0.004475209108360682,\n",
       "                                                    0.8112250807781638)))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('Input',\n", " FeatureUnion(transformer_list=[('question',\n", " Pipeline(steps=[('selector',\n", " FunctionTransformer(func=,\n", " kw_args={'column_name': 'Q_en'})),\n", " ('vectorizer',\n", " Vectorizer(env='saved-dictionary',\n", " model='../../dictionaries\\\\distiluse-base-multilingual-cased-v2.pkl'))])),\n", " ('answer',\n", " Pipeline(steps=[('...\n", " Classifier(C=0.1, class_weight=None, dual=False,\n", " fit_intercept=True, intercept_scaling=1,\n", " l1_ratio=None, max_iter=100,\n", " model='LogisticRegression', multi_class='auto',\n", " n_jobs=None, penalty='l2', random_state=None,\n", " solver='lbfgs', tol=0.0001, verbose=0,\n", " warm_start=False)),\n", " ('Threshold',\n", " BinaryThresholder(threshold=0.1919019469710422,\n", " threshold_range=(0.004475209108360682,\n", " 0.8112250807781638)))])" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "params = iqual_model.get_params()\n", "\n", "# Use the same fallback for both text inputs so a missing vectorizer is reported consistently\n", "print(f\"Q vectorizer: {params.get('Input__question__vectorizer__model','Dropped')}\")\n", "print(f\"A vectorizer: {params.get('Input__answer__vectorizer__model','Dropped')}\")\n", "print(f\"Classifier: {params.get('Classifier__model')}\")\n", "\n", "# The threshold layer is optional, so only apply float formatting when a value is present\n", "threshold = params.get('Threshold__threshold')\n", "threshold_text = f\"{threshold:.3f}\" if threshold is not None else 'not set'\n", "print(f\"Threshold for classification: {threshold_text}\")\n", "\n", "iqual_model.model" ] }, { "cell_type": "markdown", "id": "c7575dd4", "metadata": {}, "source": [ "### Evaluate the model on out-of-sample data (held-out human-coded data)" ] }, { "cell_type": "code", "execution_count": 18, "id": "6e771963", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Out-sample F1-score: 0.795\n", "Out-sample accuracy-score: 0.981\n" ] } ], "source": [ "# Select the same question/answer columns that were used to build the training features\n", "test_X = test_df[[question_col,answer_col]]\n", "test_y = test_df[code_variable]\n", "\n", "f1_score = iqual_model.score(test_X,test_y,\n", "                             scoring_function=evaluation.get_metric('f1_score'))\n", "print(f\"Out-sample F1-score: {f1_score:.3f}\")\n", "\n", "accuracy = 
iqual_model.score(test_X,test_y,\n", " scoring_function=evaluation.get_metric('accuracy_score'))\n", "print(f\"Out-sample accuracy-score: {accuracy:.3f}\")" ] }, { "cell_type": "markdown", "id": "93f3255a", "metadata": {}, "source": [ "### Predict labels for unannotated data" ] }, { "cell_type": "code", "execution_count": 19, "id": "626b81f4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAN8AAADCCAYAAADJsRdpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAMjElEQVR4nO3db6xcdZ3H8fdniyBb/3Brd28aMLRkm5giWaCNNq5Zi2xorclWozElrhTsikrZP1keWJcHGFgjPFCTusrGXRtoohREjVWL3WvtjdkHBYp2W4rWllKz3FS6UoQtJLglXx+c39XT60zvzNyZ+c6dfl7J5J75nT/zvSf93Dnzu+f2q4jAzPrvj7ILMDtbOXxmSRw+syQOn1kSh88sicNnluSc7AI6NX/+/Fi4cGHDdS+++CJz587tb0EtcF3tma11PfbYY7+KiD+Z9kARMSsfS5cujWZ27drVdF0m19We2VoXsCda+Dfsy06zJA6fWRKHzyyJw2eWxOEzSzJrf9VwJvsnnuf6jd/LLuMP3HLZqbbrOnrnu3tUjWXzO59ZEofPLInDZ5bE4TNL4vCZJXH4zJI4fGZJpg2fpDdK2iXpCUkHJP1DGZ8naUzSofJ1pIxL0iZJhyXtk3Rl7VjryvaHJK2rjS+VtL/ss0mSevHNmg2SVt75TgG3RMQSYDmwQdISYCOwMyIWAzvLc4B3AYvL40bgbqjCCtwGvBV4C3DbZGDLNh+p7bdq5t+a2WCbNnwRcSwiflyW/w/4KXAhsAa4t2x2L/CesrwG2FL+tGk3cIGkBcBKYCwiTkTEc8AYsKqse11E7C5/C7WldiyzodXWZz5JC4ErgIeB0Yg4Vlb9EhgtyxcC/1Pb7ekydqbxpxuMmw21lu/tlPQa4BvAP0bEC/WPZRERknr+X19LupHqUpbR0VHGx8cbbjd6fnUf5aDppK5m32M3nTx5si+v065hr6ul8El6FVXwvhoR3yzDz0haEBHHyqXj8TI+AbyxtvtFZWwCWDFlfLyMX9Rg+z8QEV8GvgywbNmyWLFiRaPN+MJXv81n9w/ePeO3XHaq7bqOfnBFb4qpGR8fp9m5zDTsdbUy2yngK8BPI+JztVXbgMkZy3XAt2vj15VZz+XA8+XydAdwjaSRMtFyDbCjrHtB0vLyWtfVjmU2tFr5MfwXwIeA/ZL2lrF/Bu4EHpC0HvgF8IGybjuwGjgMvATcABARJyTdATxatrs9Ik6U5ZuAe4DzgYfKw2yoTRu+iPgvoNnv3a5usH0AG5ocazOwucH4HuDN09ViNkx8h4tZEofPLInDZ5bE4TNL4vCZJXH4zJI4fGZJHD6zJA6fWRKHzyyJw2eWxOEzS+LwmSVx+MySOHxmSRw+syQOn1kSh88sicNnlsThM0vi8JklcfjMkjh8ZkkcPrMkDp9ZEofPLInDZ5aklS5FmyUdl/R4bexTkiYk7S2P1bV1nyy91Q9KWlkbX1XGDkvaWBtfJ
OnhMn6/pHO7+Q2aDapW3vnuoXGP9M9HxOXlsR2g9GpfC1xa9vmSpDmS5gBfpOrXvgS4tmwLcFc51p8BzwHrZ/INmc0WrfRk/xFwYrrtijXA1oh4OSKeomoT9pbyOBwRRyLiN8BWYE3px/dO4MGyf723u9lQm8lnvpsl7SuXpSNlrN1+7G8Afh0Rp6aMmw29Tnsn3w3cAUT5+lngw90qqhn3ZO+NYe993m197ck+VUQ8M7ks6d+B75anzfqx02T8WeACSeeUd7+m/djL67onew8Me+/zbutbT/ZGJC2oPX0vMDkTug1YK+k8SYuAxcAjVK2gF5eZzXOpJmW2lS62u4D3l/3rvd3Nhtq0P4Yl3QesAOZLehq4DVgh6XKqy86jwEcBIuKApAeAJ4BTwIaIeKUc52ZgBzAH2BwRB8pLfALYKulfgJ8AX+nWN2c2yFrpyX5tg+GmAYmITwOfbjC+HdjeYPwI1Wyo2VnFd7iYJXH4zJI4fGZJHD6zJA6fWRKHzyyJw2eWxOEzS+LwmSVx+MySOHxmSRw+syQOn1kSh88sicNnlsThM0vi8JklcfjMkjh8ZkkcPrMkDp9ZEofPLInDZ5bE4TNL4vCZJXH4zJI4fGZJOu3JPk/SmKRD5etIGZekTaW/+j5JV9b2WVe2PyRpXW18qaT9ZZ9NpVut2dDrtCf7RmBnRCwGdpbnUPVcX1weN1I10UTSPKruRm+laopyW62b7d3AR2r7Ner/bjZ0Ou3Jvoaqfzqc3kd9DbAlKrupGl8uAFYCYxFxIiKeA8aAVWXd6yJid+nVtwX3ZLezRKef+UYj4lhZ/iUwWpbb7cl+YVmeOm429GbcOzkiQlJ0o5jpuCd7bwx77/NuS+3JDjwjaUFEHCuXjsfLeLOe7BNU3W3r4+Nl/KIG2zfknuy9Mey9z7sttSc7Ve/1yRnLeh/1bcB1ZdZzOfB8uTzdAVwjaaRMtFwD7CjrXpC0vMxyXod7sttZotOe7HcCD0haD/wC+EDZfDuwGjgMvATcABARJyTdATxatrs9IiYncW6imlE9H3ioPMyGXqc92QGubrBtABuaHGczsLnB+B7gzdPVYTZsfIeLWRKHzyyJw2eWxOEzS+LwmSVx+MySOHxmSRw+syQOn1kSh88sicNnlsThM0vi8JklcfjMkjh8ZkkcPrMkDp9ZEofPLInDZ5bE4TNL4vCZJXH4zJI4fGZJHD6zJA6fWRKHzyyJw2eWZEbhk3S09FPfK2lPGetav3azYdaNd76rIuLyiFhWnnezX7vZ0OrFZWdX+rX3oC6zgTLT8AXwn5IeKy2boXv92s2G2kx7J789IiYk/SkwJuln9ZXd7tfunuy9Mey9z7stuyc7ABExUb4el/Qtqs9s3erX3uj13JO9B4a993m3ZfdkR9JcSa+dXKbqs/44XerX3mldZrPFTN4eRoFvSZo8ztci4vuSHqV7/drNhlbH4YuII8CfNxh/li71azcbZr7DxSyJw2eWxOEzS+LwmSVx+MySOHxmSRw+syQOn1kSh88sicNnlsThM0vi8JklcfjMkjh8ZkkcPrMkDp9ZEofPLInDZ5bE4TNL4vCZJXH4zJI4fGZJHD6zJA6fWZLBa2hgQ2vhxu+1tf0tl53i+jb36Yd7Vs3tynH8zmeWxOEzSzIw4ZO0StLB0rN94/R7mM1uAxE+SXOAL1L1bV8CXCtpSW5VZr01EOGjaqp5OCKORMRvgK1UPdzNhtaghM992e2sM6t+1VDvyQ6clHSwyabzgV/1p6rW/X0HdemuHhVzuqE5X/1w1V3T1nVxK8cZlPA169d+mnpP9jORtCcilnWvvO5wXe0Z9roG5bLzUWCxpEWSzgXWUvVwNxtaA/HOFxGnJN0M7ADmAJsj4kByWWY9NRDhA4iI7cD2Lh1u2kvTJK6rPUNdlyKiG8cxszYNymc+s7POrAvfdLehSTpP0v1l/cOSFtbWfbKMH5S0ss91/ZOkJyTtk7RT0sW1da9I2
lseXZ1oaqGu6yX9b+31/7a2bp2kQ+Wxrs91fb5W088l/bq2rpfna7Ok45Ieb7JekjaVuvdJurK2rr3zFRGz5kE1GfMkcAlwLvDfwJIp29wE/FtZXgvcX5aXlO3PAxaV48zpY11XAX9clj8+WVd5fjLxfF0P/GuDfecBR8rXkbI80q+6pmz/d1STcD09X+XYfwlcCTzeZP1q4CFAwHLg4U7P12x752vlNrQ1wL1l+UHgakkq41sj4uWIeAo4XI7Xl7oiYldEvFSe7qb6XWavzeS2vZXAWESciIjngDFgVVJd1wL3dem1zygifgScOMMma4AtUdkNXCBpAR2cr9kWvlZuQ/vdNhFxCngeeEOL+/ayrrr1VD89J71a0h5JuyW9p0s1tVPX+8ol1IOSJm92GIjzVS7PFwE/rA336ny1olntbZ+vgflVw9lC0t8Ay4B31IYvjogJSZcAP5S0PyKe7FNJ3wHui4iXJX2U6qrhnX167VasBR6MiFdqY5nnq2tm2ztfK7eh/W4bSecArweebXHfXtaFpL8CbgX+OiJenhyPiIny9QgwDlzRr7oi4tlaLf8BLG11317WVbOWKZecPTxfrWhWe/vnq1cfXHv0Yfgcqg+yi/j9B/VLp2yzgdMnXB4oy5dy+oTLEbo34dJKXVdQTTIsnjI+ApxXlucDhzjD5EMP6lpQW34vsDt+P4HwVKlvpCzP61ddZbs3AUcpv4/u9fmqvcZCmk+4vJvTJ1we6fR8pQeqgxOzGvh5+Yd8axm7nerdBODVwNepJlQeAS6p7Xtr2e8g8K4+1/UD4Blgb3lsK+NvA/aXf4D7gfV9ruszwIHy+ruAN9X2/XA5j4eBG/pZV3n+KeDOKfv1+nzdBxwD/p/qc9t64GPAx8p6Uf3h95Pl9Zd1er58h4tZktn2mc9saDh8ZkkcPrMkDp9ZEofPLInDZ5bE4TNL4vCZJfkto2Bpb0H2NKgAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "uncoded_df[code_variable+'_pred'] = iqual_model.predict(uncoded_df[['Q_en','A_en']])\n", "\n", "uncoded_df[code_variable+\"_pred\"].hist(figsize=(3,3),bins=3)" ] }, { "cell_type": "markdown", "id": "ccf2a6dc", "metadata": {}, "source": [ "### Examples for positive predictions" ] }, { "cell_type": "code", "execution_count": 20, "id": "b1e02a70", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Q: Well, currently studying in the sixth standard. How long will you teach?\n", "A: If it is God's command, I will marry now. If there is no order to stay until next year, then I will teach until next year.\n", "\n", "Q: What are your hopes for your children?\n", "A: Now I am tensed, my daughter has grown up, I am not able to get married. Son wants to read, but can't read. School is not open, nowhere is open, can't teach. Can't teach Arabic, Mugh (Burmese).\n", "\n", "Q: You said it. What career will you be happy after completing your studies?\n", "A: I will be happy if I can get married after my studies.\n", "\n" ] } ], "source": [ "for idx, row in uncoded_df.loc[(uncoded_df[code_variable+\"_pred\"]==1),['Q_en','A_en']].sample(3).iterrows():\n", " print(\"Q: \",row['Q_en'],\"\\n\",\"A: \", row['A_en'],sep='')\n", " print()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 5 }