{ "cells": [ { "cell_type": "markdown", "id": "486801ce", "metadata": {}, "source": [ "# Model - SpaCy" ] }, { "cell_type": "markdown", "id": "f1a813d7", "metadata": {}, "source": [ "### Imports" ] }, { "cell_type": "code", "execution_count": 1, "id": "8eb77ef5", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "\n", "from iqual import iqualnlp, evaluation, crossval" ] }, { "cell_type": "markdown", "id": "17a42bc8", "metadata": {}, "source": [ "### Load `annotated (human-coded)` and `unannotated` datasets" ] }, { "cell_type": "code", "execution_count": 2, "id": "a7d035ab", "metadata": {}, "outputs": [], "source": [ "# Directory holding the input CSV files, relative to this notebook\n", "data_dir = \"../../data\"\n", "\n", "human_coded_df = pd.read_csv(os.path.join(data_dir, \"annotated.csv\"))\n", "uncoded_df = pd.read_csv(os.path.join(data_dir, \"unannotated.csv\"))" ] }, { "cell_type": "markdown", "id": "9b308f43", "metadata": {}, "source": [ "### Split the data into training and test sets" ] }, { "cell_type": "code", "execution_count": 3, "id": "e0f95233", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train Size: 7470\n", "Test Size: 2490\n" ] } ], "source": [ "# Fixed seed so the train/test split is identical on every Restart & Run All\n", "RANDOM_STATE = 42\n", "\n", "train_df, test_df = train_test_split(human_coded_df, test_size=0.25, random_state=RANDOM_STATE)\n", "print(f\"Train Size: {len(train_df)}\\nTest Size: {len(test_df)}\")" ] }, { "cell_type": "markdown", "id": "f0ea3ad2", "metadata": {}, "source": [ "### Configure training data" ] }, { "cell_type": "code", "execution_count": 4, "id": "ebffbf2c", "metadata": {}, "outputs": [], "source": [ "# Select the question and answer text columns\n", "question_col = 'Q_en'\n", "answer_col = 'A_en'\n", "\n", "# Select the code (label column) to model\n", "code_variable = 'marriage'\n", "\n", "# Create X (question/answer text) and y (labels) from the training split\n", "X = train_df[[question_col, answer_col]]\n", "y = train_df[code_variable]" ] }, { "cell_type": "raw", "id": "fa7f9df4", "metadata": {}, "source": [ "# NOTE: Make sure to download spacy language models before using 
them.\n", "\n", "# English - Small\n", "!python -m spacy download en_core_web_sm \n", "\n", "# English - Medium\n", "!python -m spacy download en_core_web_md\n", "\n", "# English - Large\n", "!python -m spacy download en_core_web_lg" ] }, { "cell_type": "markdown", "id": "23910543", "metadata": {}, "source": [ "### Initiate model" ] }, { "cell_type": "code", "execution_count": 5, "id": "ea7cd53b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('Input',\n",
       "                 FeatureUnion(transformer_list=[('question',\n",
       "                                                 Pipeline(steps=[('selector',\n",
       "                                                                  FunctionTransformer(func=<function column_selector at 0x00000274D4B7A8B0>,\n",
       "                                                                                      kw_args={'column_name': 'Q_en'})),\n",
       "                                                                 ('vectorizer',\n",
       "                                                                  Vectorizer(env='spacy',\n",
       "                                                                             model='en_core_web_sm'))])),\n",
       "                                                ('answer',\n",
       "                                                 Pipeline(steps=[('selector',\n",
       "                                                                  FunctionTransformer(func=<function column_select...\n",
       "                ('FeatureTransformation', FeatureScaler(copy=True, norm='l2')),\n",
       "                ('Classifier',\n",
       "                 Classifier(C=1.0, class_weight=None, dual=False,\n",
       "                            fit_intercept=True, intercept_scaling=1,\n",
       "                            l1_ratio=None, max_iter=100,\n",
       "                            model='LogisticRegression', multi_class='auto',\n",
       "                            n_jobs=None, penalty='l2', random_state=None,\n",
       "                            solver='lbfgs', tol=0.0001, verbose=0,\n",
       "                            warm_start=False)),\n",
       "                ('Threshold', BinaryThresholder())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('Input',\n", " FeatureUnion(transformer_list=[('question',\n", " Pipeline(steps=[('selector',\n", " FunctionTransformer(func=,\n", " kw_args={'column_name': 'Q_en'})),\n", " ('vectorizer',\n", " Vectorizer(env='spacy',\n", " model='en_core_web_sm'))])),\n", " ('answer',\n", " Pipeline(steps=[('selector',\n", " FunctionTransformer(func=" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAN8AAADCCAYAAADJsRdpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAMv0lEQVR4nO3dX4xc9XnG8e9TUwh1m7KO25UFETaqpcgEBfAqsdqqNaHCjiPViRpFRmkxxI2TYvpH9UWdckEEjQoXNJLTlIo2FliiGEob4TSm7tbxKsqFAdO6Xkzq2BhHxXJwgwnUIJEuentxfpscuzPemdmZeWfHz0cazZnf+TPvHvnxnPntsV9FBGbWfz+VXYDZhcrhM0vi8JklcfjMkjh8ZkkcPrMkF2UX0KmFCxfG4sWLG6578803mT9/fn8LaoHras9creu55577QUT8wowHiog5+Vi+fHk0s3fv3qbrMrmu9szVuoD90cKfYV92miVx+MySOHxmSRw+syQOn1mSOfurhvOZPPE6t275RnYZ/8/ma6baruv4vR/tUTWWzZ98ZkkcPrMkDp9ZEofPLInDZ5bE4TNL4vCZJZkxfJLeK2mvpBckHZL0h2V8gaRxSUfK80gZl6Stko5KOijp+tqx1pftj0haXxtfLmmy7LNVknrxw5oNklY++aaAzRGxDFgBbJK0DNgC7ImIpcCe8hrgI8DS8tgIPABVWIG7gA8BHwTumg5s2eYztf1Wz/5HMxtsM4YvIk5GxL+V5f8BvgNcDqwFHi6bPQx8rCyvBbaXf9q0D7hM0iJgFTAeEacj4jVgHFhd1r07IvaVfwu1vXYss6HV1u1lkhYD1wFPA6MRcbKs+j4wWpYvB/6rttvLZex84y83GG/0/hupPk0ZHR1lYmKiYZ2jl1a3cg2aTupq9jN205kzZ/ryPu0a9rpaDp+knwX+AfijiHij/rUsIkJSz//r64h4EHgQYGxsLFauXNlwuy8/8iT3Tw7ebaubr5lqu67jn1rZm2JqJiYmaHYuMw17XS3Ndkr6aargPRIR/1iGXymXjJTnU2X8BPDe2u5XlLHzjV/RYNxsqLUy2yngq8B3IuIvaqt2AtMzluuBJ2vjt5RZzxXA6+XydDdwk6SRMtFyE7C7rHtD0oryXrfUjmU2tFq5BvoV4HeASUkHytifAvcCj0vaAHwP+GRZtwtYAxwF3gJuA4iI05LuAZ4t290dEafL8u3AQ8ClwFPlYTbUZgxfRHwbaPZ7txsbbB/ApibH2gZsazC+H3j/TLWYDRPf4WKWxOEzS+LwmSVx+MySOHxmSRw+syQOn1kSh88sicNnlsThM0vi8JklcfjMkjh8ZkkcPrMkDp9ZEofPLInDZ5bE4TNL4vCZJXH4zJI4fGZJHD6zJA6fWRKHzyyJw2eWxOEzS+LwmSVppUvRNkmnJD1fG/uCpBOSDpTHmtq6z5fe6oclraqNry5jRyVtqY0vkfR0GX9M0sXd/AHNBlUrn3wP0bhH+pci4try2AVQer
WvA64u+/yVpHmS5gFfoerXvgy4uWwLcF851i8BrwEbZvMDmc0VrfRk/xZweqbtirXAjoh4OyJeomoT9sHyOBoRxyLiR8AOYG3px/dh4Imyf723u9lQm813vjskHSyXpSNlrN1+7O8BfhgRU+eMmw29ThuXPwDcA0R5vh/4dLeKakbSRmAjwOjoaNOm9KOXVv3PB00ndTX7GbvpzJkzfXmfdg17XR2FLyJemV6W9DfAP5WXzfqu02T8VeAySReVT7/z9mOPiAeBBwHGxsaiWVP6Lz/yJPdPdvr3Su9svmaq7bqOf2plb4qpmZiYoNm5zDTsdXV02SlpUe3lx4HpmdCdwDpJl0haAiwFnqFqBb20zGxeTDUps7N0sd0LfKLsX+/tbjbUZvxrWNKjwEpgoaSXgbuAlZKupbrsPA58FiAiDkl6HHgBmAI2RcQ75Th3ALuBecC2iDhU3uJPgB2S/gz4d+Cr3frhzAZZKz3Zb24w3DQgEfFF4IsNxncBuxqMH6OaDTW7oPgOF7MkDp9ZEofPLInDZ5bE4TNL4vCZJXH4zJI4fGZJHD6zJA6fWRKHzyyJw2eWxOEzS+LwmSVx+MySOHxmSRw+syQOn1kSh88sicNnlsThM0vi8JklcfjMkjh8ZkkcPrMkDp9ZEofPLEmnPdkXSBqXdKQ8j5RxSdpa+qsflHR9bZ/1ZfsjktbXxpdLmiz7bC3das2GXqc92bcAeyJiKbCnvIaq5/rS8thI1UQTSQuouht9iKopyl21brYPAJ+p7deo/7vZ0Om0J/taqv7pcHYf9bXA9qjso2p8uQhYBYxHxOmIeA0YB1aXde+OiH2lV9923JPdLhCdfucbjYiTZfn7wGhZbrcn++Vl+dxxs6E3697JERGSohvFzMQ92Xtj2Hufd1tqT3bgFUmLIuJkuXQ8Vcab9WQ/QdXdtj4+UcavaLB9Q+7J3hvD3vu821J7slP1Xp+esaz3Ud8J3FJmPVcAr5fL093ATZJGykTLTcDusu4NSSvKLOctuCe7XSA67cl+L/C4pA3A94BPls13AWuAo8BbwG0AEXFa0j3As2W7uyNiehLndqoZ1UuBp8rDbOh12pMd4MYG2wawqclxtgHbGozvB94/Ux1mw8Z3uJglcfjMkjh8ZkkcPrMkDp9ZEofPLInDZ5bE4TNL4vCZJXH4zJI4fGZJHD6zJA6fWRKHzyyJw2eWxOEzS+LwmSVx+MySOHxmSRw+syQOn1kSh88sicNnlsThM0vi8JklcfjMkjh8ZklmFT5Jx0s/9QOS9pexrvVrNxtm3fjkuyEiro2IsfK6m/3azYZWLy47u9KvvQd1mQ2U2YYvgH+R9Fxp2Qzd69duNtRm2zv5VyPihKRfBMYl/Wd9Zbf7tbsne28Me+/zbsvuyQ5ARJwoz6ckfY3qO1u3+rU3ej/3ZO+BYe993m3ZPdmRNF/Sz00vU/VZf54u9WvvtC6zuWI2Hw+jwNckTR/n7yLinyU9S/f6tZsNrY7DFxHHgA80GH+VLvVrNxtmvsPFLInDZ5bE4TNL4vCZJXH4zJI4fGZJHD6zJA6fWRKHzyyJw2eWxOEzS+LwmSVx+MySOHxmSRw+syQOn1kSh88sicNnlsThM0vi8JklcfjMkjh8ZkkcPrMkDp9ZksFraGBDa/GWb7S1/eZrpri1zX364aHV87tyHH/ymSVx+MySDEz4JK2WdLj0bN8y8x5mc9tAhE/SPOArVH3blwE3S1qWW5VZbw1E+Kiaah6NiGMR8SNgB1UPd7OhNSjhc192u+DMqV811HuyA2ckHW6y6ULgB/2pqnV/0EFduq9HxZxtaM5XP9xw34x1XdnKcQYlfM36tZ+l3pP9fCTtj4ix7pXXHa6rPcNe16Bcdj4LLJW0RNLFwDqqHu5mQ2sgPvkiYkrSHcBuYB6wLSIOJZdl1lMDET6AiNgF7OrS4Wa8NE3iutoz1HUpIrpxHDNr06B85zO74My58M10G5qkSyQ9VtY/LW
lxbd3ny/hhSav6XNcfS3pB0kFJeyRdWVv3jqQD5dHViaYW6rpV0n/X3v93a+vWSzpSHuv7XNeXajV9V9IPa+t6eb62STol6fkm6yVpa6n7oKTra+vaO18RMWceVJMxLwJXARcD/wEsO2eb24G/LsvrgMfK8rKy/SXAknKceX2s6wbgZ8ry703XVV6fSTxftwJ/2WDfBcCx8jxSlkf6Vdc52/8+1SRcT89XOfavAdcDzzdZvwZ4ChCwAni60/M11z75WrkNbS3wcFl+ArhRksr4joh4OyJeAo6W4/WlrojYGxFvlZf7qH6X2WuzuW1vFTAeEacj4jVgHFidVNfNwKNdeu/ziohvAafPs8laYHtU9gGXSVpEB+drroWvldvQfrxNREwBrwPvaXHfXtZVt4Hqb89p75K0X9I+SR/rUk3t1PVb5RLqCUnTNzsMxPkql+dLgG/Whnt1vlrRrPa2z9fA/KrhQiHpt4Ex4Ndrw1dGxAlJVwHflDQZES/2qaSvA49GxNuSPkt11fDhPr13K9YBT0TEO7WxzPPVNXPtk6+V29B+vI2ki4CfB15tcd9e1oWk3wDuBH4zIt6eHo+IE+X5GDABXNevuiLi1Votfwssb3XfXtZVs45zLjl7eL5a0az29s9Xr7649ujL8EVUX2SX8JMv6lefs80mzp5webwsX83ZEy7H6N6ESyt1XUc1ybD0nPER4JKyvBA4wnkmH3pQ16La8seBffGTCYSXSn0jZXlBv+oq270POE75fXSvz1ftPRbTfMLlo5w94fJMp+crPVAdnJg1wHfLH+Q7y9jdVJ8mAO8C/p5qQuUZ4KravneW/Q4DH+lzXf8KvAIcKI+dZfyXgcnyB3AS2NDnuv4cOFTefy/wvtq+ny7n8ShwWz/rKq+/ANx7zn69Pl+PAieB/6X63rYB+BzwubJeVP/w+8Xy/mOdni/f4WKWZK595zMbGg6fWRKHzyyJw2eWxOEzS+LwmSVx+MySOHxmSf4PtSdljY5i/McAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Predict the selected code for the unannotated data, using the configured Q/A columns\n", "pred_col = code_variable + '_pred'\n", "uncoded_df[pred_col] = iqual_model.predict(uncoded_df[[question_col, answer_col]])\n", "uncoded_df[pred_col].hist(figsize=(3,3),bins=3)" ] }, { "cell_type": "markdown", "id": "dd2e083f", "metadata": {}, "source": [ "### Examples for positive predictions" ] }, { "cell_type": "code", "execution_count": 13, "id": "b1e02a70", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Q: ok Teach your daughter till SSC. But what kind of work do you expect to do after finishing your studies?\n", "A: think If there is a proposal, I will marry. If the girl does not want to marry. Will work. Then I will allow the job.\n", "\n", "Q: Your daughter is now studying in higher secondary. What kind of work do you expect him to finish his studies?\n", "A: When she goes to her in-laws house, she will do what her husband does.\n", "\n", "Q: He tells us that he has hopes and dreams about his children.\n", "A: I will marry the girls beautifully after seeing good boys. And let the boys study, that's all.\n", "\n" ] } ], "source": [ "# Print a few randomly sampled Q/A pairs whose prediction equals 1\n", "positive_examples = uncoded_df.loc[uncoded_df[pred_col] == 1, [question_col, answer_col]]\n", "\n", "# Cap the sample size so the cell does not raise when fewer than 3 rows are predicted positive\n", "n_examples = min(3, len(positive_examples))\n", "for _, row in positive_examples.sample(n_examples).iterrows():\n", "    print(f\"Q: {row[question_col]}\\nA: {row[answer_col]}\\n\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 5 }