Model - SpaCy
Contents
Model - SpaCy#
Imports#
import os
import pandas as pd
from iqual import iqualnlp, evaluation, crossval
Load annotated (human-coded) and unannotated datasets#
# Both CSV files are read from the same data directory, relative to this notebook.
data_dir = "../../data"
annotated_path = os.path.join(data_dir, "annotated.csv")
unannotated_path = os.path.join(data_dir, "unannotated.csv")
# Human-coded rows feed training/evaluation; uncoded rows receive predictions later.
human_coded_df = pd.read_csv(annotated_path)
uncoded_df = pd.read_csv(unannotated_path)
Split the data into training and test sets#
from sklearn.model_selection import train_test_split
# Hold out 25% of the human-coded data for testing.
# A fixed seed keeps the split (and every score derived from it) reproducible across runs.
train_df, test_df = train_test_split(human_coded_df, test_size=0.25, random_state=42)
print(f"Train Size: {len(train_df)}\nTest Size: {len(test_df)}")
Train Size: 7470
Test Size: 2490
Configure training data#
### Question and answer text columns
question_col = 'Q_en'
answer_col = 'A_en'
### Annotation code to model
code_variable = 'marriage'
### Features (X): the two text columns; target (y): the selected code
text_columns = [question_col, answer_col]
X = train_df[text_columns]
y = train_df[code_variable]
Initiate model#
# Build the classification pipeline layer by layer; the layers are added in the
# order they appear in the compiled pipeline (input features -> transformer ->
# classifier -> threshold).
# Step 1: Initiate the model class
iqual_model = iqualnlp.Model()
# Step 2: Add layers to the model
# Add text columns, and choose a feature extraction model (Available options: scikit-learn, spacy, sentence-transformers, saved-dictionary (pickled dictionary))
iqual_model.add_text_features(question_col,answer_col,model='en_core_web_sm',env='spacy')
# Step 3: Add a feature transforming layer (optional)
# A. Choose a feature-scaler. Available options:
# - Any scikit-learn scaler from `sklearn.preprocessing`
iqual_model.add_feature_transformer(name='Normalizer', transformation="FeatureScaler")
# OR
# B. Choose a dimensionality reduction model. Available options:
# - Any scikit-learn dimensionality reduction model from `sklearn.decomposition`
# - Uniform Manifold Approximation and Projection (UMAP) using umap.UMAP (https://umap-learn.readthedocs.io/en/latest/)
# Example of option B (left disabled because option A is used above):
## iqual_model.add_feature_transformer(name='PCA', transformation="DimensionalityReduction")
# Step 4: Add a classifier layer
# Choose a primary classifier model (Available options: any scikit-learn classifier)
# NOTE(review): the pipeline repr printed after compile() shows model='LogisticRegression'
# rather than 'SGDClassifier' — confirm whether that captured output is stale.
iqual_model.add_classifier(name = "SGDClassifier")
# Step 5: Add a threshold layer. This is optional, but recommended for binary classification
iqual_model.add_threshold(scoring_metric='f1')
# Step 6: Compile the model
iqual_model.compile()
Pipeline(steps=[('Input', FeatureUnion(transformer_list=[('question', Pipeline(steps=[('selector', FunctionTransformer(func=<function column_selector at 0x00000274D4B7A8B0>, kw_args={'column_name': 'Q_en'})), ('vectorizer', Vectorizer(env='spacy', model='en_core_web_sm'))])), ('answer', Pipeline(steps=[('selector', FunctionTransformer(func=<function column_select... ('FeatureTransformation', FeatureScaler(copy=True, norm='l2')), ('Classifier', Classifier(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, model='LogisticRegression', multi_class='auto', n_jobs=None, penalty='l2', random_state=None, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)), ('Threshold', BinaryThresholder())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Input', FeatureUnion(transformer_list=[('question', Pipeline(steps=[('selector', FunctionTransformer(func=<function column_selector at 0x00000274D4B7A8B0>, kw_args={'column_name': 'Q_en'})), ('vectorizer', Vectorizer(env='spacy', model='en_core_web_sm'))])), ('answer', Pipeline(steps=[('selector', FunctionTransformer(func=<function column_select... ('FeatureTransformation', FeatureScaler(copy=True, norm='l2')), ('Classifier', Classifier(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, model='LogisticRegression', multi_class='auto', n_jobs=None, penalty='l2', random_state=None, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)), ('Threshold', BinaryThresholder())])
FeatureUnion(transformer_list=[('question', Pipeline(steps=[('selector', FunctionTransformer(func=<function column_selector at 0x00000274D4B7A8B0>, kw_args={'column_name': 'Q_en'})), ('vectorizer', Vectorizer(env='spacy', model='en_core_web_sm'))])), ('answer', Pipeline(steps=[('selector', FunctionTransformer(func=<function column_selector at 0x00000274D4B7A8B0>, kw_args={'column_name': 'A_en'})), ('vectorizer', Vectorizer(env='spacy', model='en_core_web_sm'))]))])
FunctionTransformer(func=<function column_selector at 0x00000274D4B7A8B0>, kw_args={'column_name': 'Q_en'})
Vectorizer(env='spacy', model='en_core_web_sm')
FunctionTransformer(func=<function column_selector at 0x00000274D4B7A8B0>, kw_args={'column_name': 'A_en'})
Vectorizer(env='spacy', model='en_core_web_sm')
FeatureScaler(copy=True, norm='l2')
Classifier(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, model='LogisticRegression', multi_class='auto', n_jobs=None, penalty='l2', random_state=None, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)
BinaryThresholder()
Configure a Hyperparameter Grid for cross-validation + fitting#
def _spacy_vectorizer_grid():
    """Return a fresh search grid for one spaCy-backed vectorizer step."""
    return {
        "vectorizer": {
            "model": ["en_core_web_sm", "en_core_web_md"],
            "env": ["spacy"],
        },
    }


# Nested grid: keys follow the pipeline's step names, leaves list the candidate values.
search_param_config = {
    "Input": {
        "question": _spacy_vectorizer_grid(),
        "answer": _spacy_vectorizer_grid(),
    },
    "Classifier": {
        "model": ["SGDClassifier"],
        "loss": ['modified_huber'],
        "tol": [1e-2],
        "alpha": [1e-4, 1e-2, 1e-1],
        "max_iter": [100],
    },
}
# Flatten the nested grid into double-underscore-joined parameter names.
CV_SEARCH_PARAMS = crossval.convert_nested_params(search_param_config)
print(CV_SEARCH_PARAMS)
{'Input__question__vectorizer__model': ['en_core_web_sm', 'en_core_web_md'], 'Input__question__vectorizer__env': ['spacy'], 'Input__answer__vectorizer__model': ['en_core_web_sm', 'en_core_web_md'], 'Input__answer__vectorizer__env': ['spacy'], 'Classifier__model': ['SGDClassifier'], 'Classifier__loss': ['modified_huber'], 'Classifier__tol': [0.01], 'Classifier__alpha': [0.0001, 0.01, 0.1], 'Classifier__max_iter': [100]}
Model training:#
Cross-validate over hyperparameters and select the best model
# Scorers evaluated during cross-validation, keyed by metric name
scoring_dict = {'f1': evaluation.get_scorer('f1')}
cv_dict = iqual_model.cross_validate_fit(
    X,  # pandas DataFrame of features
    y,  # pandas Series of labels
    search_parameters=CV_SEARCH_PARAMS,  # hyperparameter grid to search over
    cv_method='GridSearchCV',  # search strategy; options: GridSearchCV, RandomizedSearchCV
    scoring=scoring_dict,  # metric(s) computed on each split
    refit='f1',  # metric used to pick and refit the final model
    n_jobs=-1,  # number of parallel jobs
    cv_splits=3,  # number of cross-validation splits
)
print()
average_f1 = cv_dict['avg_test_score']
print(f"Average F1 score: {average_f1:.3f}")
.......12 hyperparameters configurations possible.....
Average F1 score: 0.610
Evaluate model using out-of-sample data (held-out human-coded data)#
# Score the fitted model on the held-out split.
# Reuse the configured column names so evaluation always reads the same columns as training.
test_X = test_df[[question_col, answer_col]]
test_y = test_df[code_variable]
f1_metric = evaluation.get_metric('f1_score')
f1_score = iqual_model.score(test_X, test_y, scoring_function=f1_metric)
print(f"Out-sample F1-score: {f1_score:.3f}")
Out-sample F1-score: 0.636
Predict labels for unannotated data#
# Store predictions next to the unannotated rows, in a column named after the code.
# The configured column names are reused so prediction reads the same columns as training.
pred_col = code_variable + '_pred'
uncoded_df[pred_col] = iqual_model.predict(uncoded_df[[question_col, answer_col]])
# Quick look at the distribution of predicted labels.
uncoded_df[pred_col].hist(figsize=(3,3),bins=3)
<AxesSubplot:>
Examples for positive predictions#
# Show up to three randomly sampled question/answer pairs that were predicted positive.
positive_rows = uncoded_df.loc[
    uncoded_df[code_variable + "_pred"] == 1,
    [question_col, answer_col],
]
# Cap the sample size so the cell does not raise when fewer than three positives exist.
for _, row in positive_rows.sample(min(3, len(positive_rows))).iterrows():
    print("Q: ", row[question_col], "\n", "A: ", row[answer_col], sep='')
    print()
Q: ok Teach your daughter till SSC. But what kind of work do you expect to do after finishing your studies?
A: think If there is a proposal, I will marry. If the girl does not want to marry. Will work. Then I will allow the job.
Q: Your daughter is now studying in higher secondary. What kind of work do you expect him to finish his studies?
A: When she goes to her in-laws house, she will do what her husband does.
Q: He tells us that he has hopes and dreams about his children.
A: I will marry the girls beautifully after seeing good boys. And let the boys study, that's all.