Model - Multiple Annotations#
Imports#
import os
import pandas as pd
from iqual import iqualnlp, evaluation, crossval
Load annotated (human-coded) and unannotated datasets#
data_dir = "../../data"
human_coded_df = pd.read_csv(os.path.join(data_dir,"annotated.csv"))
uncoded_df = pd.read_csv(os.path.join(data_dir,"unannotated.csv"))
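A quick sanity check before modeling can confirm that the expected columns are present; a minimal sketch, assuming annotated.csv holds the Q_en/A_en text columns plus one binary (0/1) column per code:
# Sanity check (column names assumed from the rest of this notebook)
print(human_coded_df.shape, uncoded_df.shape)
print(human_coded_df[['religious', 'migration', 'entrepreneur', 'secular', 'marriage']].mean())  # positive-label rates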
Split the data into training and test sets#
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(human_coded_df,test_size=0.25)
print(f"Train Size: {len(train_df)}\nTest Size: {len(test_df)}")
Train Size: 7470
Test Size: 2490
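Some codes are rare, so a plain random split can leave few positives in the test set. If reproducibility or balance matters, the split can be seeded and stratified on a code of interest; a sketch, with secular as an assumed rare code:
train_df, test_df = train_test_split(
    human_coded_df,
    test_size=0.25,
    stratify=human_coded_df['secular'],  # hypothetical choice of stratification column
    random_state=42,                     # fixed seed for reproducibility
)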
Configure training data#
### Select Question and Answer Columns
question_col = 'Q_en'
answer_col = 'A_en'
### Select the codes (annotation variables) to model
code_variables = ['religious','migration','entrepreneur','secular','marriage']
# Scoring Dict for evaluation
scoring_dict = {'f1':evaluation.get_scorer('f1')}
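Cross-validation can also track several metrics at once; a sketch, assuming these metric names are all registered with evaluation.get_scorer:
# Hypothetical multi-metric setup; 'f1' is still used for refitting below
scoring_dict = {m: evaluation.get_scorer(m) for m in ['f1', 'precision', 'recall', 'accuracy']}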
Configure hyperparameter grids for cross-validation and fitting#
## Paths for precomputed vectors created using `sentence-transformers`
dict_dir = "../../dictionaries"
sbert_models = ["all-mpnet-base-v2", "distiluse-base-multilingual-cased-v2"]
sbert_model_paths = [os.path.join(dict_dir,m+'.pkl') for m in sbert_models]
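The saved-dictionary environment expects each .pkl file to map every text the model will see to its precomputed embedding. A minimal sketch of how such a file might be built with sentence-transformers (the exact dictionary format is an assumption here):
import pickle
from sentence_transformers import SentenceTransformer

# Collect every question and answer string that will be vectorized
texts = pd.concat([human_coded_df[c] for c in ('Q_en', 'A_en')] +
                  [uncoded_df[c] for c in ('Q_en', 'A_en')]).dropna().unique()

encoder = SentenceTransformer("all-mpnet-base-v2")
vectors = encoder.encode(list(texts), show_progress_bar=True)

with open(os.path.join(dict_dir, "all-mpnet-base-v2.pkl"), "wb") as f:
    pickle.dump(dict(zip(texts, vectors)), f)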
SBERT_QA_PARAMS = {
    "Input": {
        "question": {
            "vectorizer": {
                "model": sbert_model_paths,
                "env": ["saved-dictionary"],
            },
        },
        "answer": {
            "vectorizer": {
                "model": sbert_model_paths,
                "env": ["saved-dictionary"],
            },
        },
    }
}
SBERT_A_PARAMS = {
    "Input": {
        "question": "drop",
        "answer": {
            "vectorizer": {
                "model": sbert_model_paths,
                "env": ["saved-dictionary"],
            },
        },
    }
}
SKLEARN_QA_PARAMS = {
    "Input": {
        "question": {
            "vectorizer": {
                "model": ['TfidfVectorizer', 'CountVectorizer'],
                "max_features": [500, 1000, 1500, 2500],
                "env": ["scikit-learn"],
            },
        },
        "answer": {
            "vectorizer": {
                "model": ['TfidfVectorizer', 'CountVectorizer'],
                "max_features": [1500, 2500, 4000],
                "env": ["scikit-learn"],
            },
        },
    }
}
SKLEARN_A_PARAMS = {
    "Input": {
        "question": "drop",
        "answer": {
            "vectorizer": {
                "model": ['TfidfVectorizer', 'CountVectorizer'],
                "max_features": [1500, 2500, 4000],
                "env": ["scikit-learn"],
            },
        },
    }
}
LOGISTIC_PARAMS = {
    "Classifier": {
        "model": ["LogisticRegression"],
        "C": [0.01, 0.1],
    },
}
RANDOM_FOREST_PARAMS = {
    "Classifier": {
        "model": ["RandomForestClassifier"],
        "n_estimators": [100, 200],
        "max_depth": [5, 10, 15],
    },
}
SGD_PARAMS = {
    "Classifier": {
        "model": ["SGDClassifier"],
        "loss": ["hinge", "log"],
        "alpha": [0.0001, 0.001],
    },
}
### Combine each vectorizer grid with each classifier grid
VECTORIZATION_PARAMS = [SKLEARN_QA_PARAMS, SKLEARN_A_PARAMS, SBERT_QA_PARAMS, SBERT_A_PARAMS]
CLASSIFIER_PARAMS = [LOGISTIC_PARAMS, RANDOM_FOREST_PARAMS, SGD_PARAMS]

params_all = [
    {**vect_params, **clf_params}
    for vect_params in VECTORIZATION_PARAMS
    for clf_params in CLASSIFIER_PARAMS
]
CV_SEARCH_PARAMS = [crossval.convert_nested_params(params) for params in params_all]

print(CV_SEARCH_PARAMS)
[{'Input__question__vectorizer__model': ['TfidfVectorizer', 'CountVectorizer'], 'Input__question__vectorizer__max_features': [500, 1000, 1500, 2500], 'Input__question__vectorizer__env': ['scikit-learn'], 'Input__answer__vectorizer__model': ['TfidfVectorizer', 'CountVectorizer'], 'Input__answer__vectorizer__max_features': [1500, 2500, 4000], 'Input__answer__vectorizer__env': ['scikit-learn'], 'Classifier__model': ['LogisticRegression'], 'Classifier__C': [0.01, 0.1]}, {'Input__question__vectorizer__model': ['TfidfVectorizer', 'CountVectorizer'], 'Input__question__vectorizer__max_features': [500, 1000, 1500, 2500], 'Input__question__vectorizer__env': ['scikit-learn'], 'Input__answer__vectorizer__model': ['TfidfVectorizer', 'CountVectorizer'], 'Input__answer__vectorizer__max_features': [1500, 2500, 4000], 'Input__answer__vectorizer__env': ['scikit-learn'], 'Classifier__model': ['RandomForestClassifier'], 'Classifier__n_estimators': [100, 200], 'Classifier__max_depth': [5, 10, 15]}, {'Input__question__vectorizer__model': ['TfidfVectorizer', 'CountVectorizer'], 'Input__question__vectorizer__max_features': [500, 1000, 1500, 2500], 'Input__question__vectorizer__env': ['scikit-learn'], 'Input__answer__vectorizer__model': ['TfidfVectorizer', 'CountVectorizer'], 'Input__answer__vectorizer__max_features': [1500, 2500, 4000], 'Input__answer__vectorizer__env': ['scikit-learn'], 'Classifier__model': ['SGDClassifier'], 'Classifier__loss': ['hinge', 'log'], 'Classifier__alpha': [0.0001, 0.001]}, {'Input__answer__vectorizer__model': ['TfidfVectorizer', 'CountVectorizer'], 'Input__answer__vectorizer__max_features': [1500, 2500, 4000], 'Input__answer__vectorizer__env': ['scikit-learn'], 'Classifier__model': ['LogisticRegression'], 'Classifier__C': [0.01, 0.1]}, {'Input__answer__vectorizer__model': ['TfidfVectorizer', 'CountVectorizer'], 'Input__answer__vectorizer__max_features': [1500, 2500, 4000], 'Input__answer__vectorizer__env': ['scikit-learn'], 'Classifier__model': ['RandomForestClassifier'], 'Classifier__n_estimators': [100, 200], 'Classifier__max_depth': [5, 10, 15]}, {'Input__answer__vectorizer__model': ['TfidfVectorizer', 'CountVectorizer'], 'Input__answer__vectorizer__max_features': [1500, 2500, 4000], 'Input__answer__vectorizer__env': ['scikit-learn'], 'Classifier__model': ['SGDClassifier'], 'Classifier__loss': ['hinge', 'log'], 'Classifier__alpha': [0.0001, 0.001]}, {'Input__question__vectorizer__model': ['../../dictionaries\\all-mpnet-base-v2.pkl', '../../dictionaries\\distiluse-base-multilingual-cased-v2.pkl'], 'Input__question__vectorizer__env': ['saved-dictionary'], 'Input__answer__vectorizer__model': ['../../dictionaries\\all-mpnet-base-v2.pkl', '../../dictionaries\\distiluse-base-multilingual-cased-v2.pkl'], 'Input__answer__vectorizer__env': ['saved-dictionary'], 'Classifier__model': ['LogisticRegression'], 'Classifier__C': [0.01, 0.1]}, {'Input__question__vectorizer__model': ['../../dictionaries\\all-mpnet-base-v2.pkl', '../../dictionaries\\distiluse-base-multilingual-cased-v2.pkl'], 'Input__question__vectorizer__env': ['saved-dictionary'], 'Input__answer__vectorizer__model': ['../../dictionaries\\all-mpnet-base-v2.pkl', '../../dictionaries\\distiluse-base-multilingual-cased-v2.pkl'], 'Input__answer__vectorizer__env': ['saved-dictionary'], 'Classifier__model': ['RandomForestClassifier'], 'Classifier__n_estimators': [100, 200], 'Classifier__max_depth': [5, 10, 15]}, {'Input__question__vectorizer__model': ['../../dictionaries\\all-mpnet-base-v2.pkl', 
'../../dictionaries\\distiluse-base-multilingual-cased-v2.pkl'], 'Input__question__vectorizer__env': ['saved-dictionary'], 'Input__answer__vectorizer__model': ['../../dictionaries\\all-mpnet-base-v2.pkl', '../../dictionaries\\distiluse-base-multilingual-cased-v2.pkl'], 'Input__answer__vectorizer__env': ['saved-dictionary'], 'Classifier__model': ['SGDClassifier'], 'Classifier__loss': ['hinge', 'log'], 'Classifier__alpha': [0.0001, 0.001]}, {'Input__answer__vectorizer__model': ['../../dictionaries\\all-mpnet-base-v2.pkl', '../../dictionaries\\distiluse-base-multilingual-cased-v2.pkl'], 'Input__answer__vectorizer__env': ['saved-dictionary'], 'Classifier__model': ['LogisticRegression'], 'Classifier__C': [0.01, 0.1]}, {'Input__answer__vectorizer__model': ['../../dictionaries\\all-mpnet-base-v2.pkl', '../../dictionaries\\distiluse-base-multilingual-cased-v2.pkl'], 'Input__answer__vectorizer__env': ['saved-dictionary'], 'Classifier__model': ['RandomForestClassifier'], 'Classifier__n_estimators': [100, 200], 'Classifier__max_depth': [5, 10, 15]}, {'Input__answer__vectorizer__model': ['../../dictionaries\\all-mpnet-base-v2.pkl', '../../dictionaries\\distiluse-base-multilingual-cased-v2.pkl'], 'Input__answer__vectorizer__env': ['saved-dictionary'], 'Classifier__model': ['SGDClassifier'], 'Classifier__loss': ['hinge', 'log'], 'Classifier__alpha': [0.0001, 0.001]}]
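Each of the four vectorization grids is paired with each of the three classifier grids, giving 12 search grids; expanded, they span 720 hyperparameter configurations in total, which matches the count printed during training below.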
Model training#
Cross-validate over hyperparameters and select the best model for each code variable.
fitted_models = {}

for code_var in code_variables:
    print(code_var)

    ### Create X and y
    X = train_df[[question_col, answer_col]]
    y = train_df[code_var]

    iqual_model = iqualnlp.Model()
    iqual_model.add_text_features(question_col, answer_col, model='TfidfVectorizer', env='scikit-learn')
    iqual_model.add_classifier(name="LogisticRegression")
    iqual_model.add_threshold(scoring_metric='f1')
    iqual_model.compile()

    cv_dict = iqual_model.cross_validate_fit(
        X, y,                               # X: DataFrame of features, y: Series of labels
        search_parameters=CV_SEARCH_PARAMS, # List of hyperparameter grids to search over
        cv_method='RandomizedSearchCV',     # Options: GridSearchCV, RandomizedSearchCV
        n_iter=10,                          # Number of sampled configurations (RandomizedSearchCV only)
        scoring=scoring_dict,               # Scoring metric(s) for cross-validation
        refit='f1',                         # Metric used to refit the best model
        n_jobs=-1,                          # Number of parallel jobs
        cv_splits=3,                        # Number of cross-validation folds
    )
    print()
    print()
    print("Average F1 score for {code_var}: {score:.3f}".format(code_var=code_var, score=cv_dict['avg_test_score']))

    # Save the fitted model to a dictionary
    fitted_models[code_var] = iqual_model
religious
.......720 hyperparameters configurations possible.....
Average F1 score for religious: 0.605
migration
.......720 hyperparameters configurations possible.....
Average F1 score for migration: 0.647
entrepreneur
.......720 hyperparameters configurations possible.....
Average F1 score for entrepreneur: 0.581
secular
.......720 hyperparameters configurations possible.....
Average F1 score for secular: 0.444
marriage
.......720 hyperparameters configurations possible.....
Average F1 score for marriage: 0.780
Evaluate models on out-of-sample data (held-out human-coded data)#
scorer = evaluation.get_metric('f1_score')

for code_var in code_variables:
    # Score the model fitted for this specific code, not the last model left over from the training loop
    f1_score = fitted_models[code_var].score(
        test_df[['Q_en', 'A_en']],
        test_df[code_var],
        scoring_function=scorer,
    )
    print(f"Out-sample F1-score for {code_var} is : {f1_score:.3f}")
Out-sample F1-score for religious is : 0.073
Out-sample F1-score for migration is : 0.024
Out-sample F1-score for entrepreneur is : 0.032
Out-sample F1-score for secular is : 0.108
Out-sample F1-score for marriage is : 0.846
Predict labels for unannotated data#
for code_var in code_variables:
    uncoded_df[code_var + '_pred'] = fitted_models[code_var].predict(uncoded_df[['Q_en', 'A_en']])
    print(f"\tExamples of positive {code_var} predictions:\n")
    print('\t===============================================\n\n')
    for idx, row in uncoded_df.loc[(uncoded_df[code_var + "_pred"] == 1), ['Q_en', 'A_en']].sample(1).iterrows():
        print("Q: ", row['Q_en'], "\n", "A: ", row['A_en'], sep='')
        print()
Examples of positive religious predictions:
===============================================
Q: What kind of job will you do in Madrasah?
A: Madrasa teacher, mosque imam.
Examples of positive migration predictions:
===============================================
Q: Well, what is the dream of your eldest child?
A: What should I do? If the money is money, I will send it abroad.
Examples of positive entrepreneur predictions:
===============================================
Q: You said that you want to study, you said that you want to go abroad, what are you doing to fulfill these dreams?
A: I can't do anything. In the future, I will do something by doing business.
Examples of positive secular predictions:
===============================================
Q: Well, which one will be more happy?
A: I am happy that my son will go to school and educate other boys, it is a happiness to teach other boys, it is a happiness to buy a shop.
Examples of positive marriage predictions:
===============================================
Q: It's your dream ok ok. What other dreams do you have besides becoming a government army officer?
A: What other dreams can there be about a girl? Here, if the village area is a little bigger, they get married and move to the in-laws' house.
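The predicted columns sit alongside the original text, so the machine-coded data can be persisted directly (the output filename here is illustrative):
# Save predictions for downstream analysis
uncoded_df.to_csv(os.path.join(data_dir, "machine_coded.csv"), index=False)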
Best parameters for each annotation#
for code_var in code_variables:
    best_params = fitted_models[code_var].cv.get_best_params()
    print(f"\tBest parameters for {code_var}:\n\n", best_params, end='\n\n')
Best parameters for religious:
{'Input__question__vectorizer__model': 'CountVectorizer', 'Input__question__vectorizer__max_features': 500, 'Input__question__vectorizer__env': 'scikit-learn', 'Input__answer__vectorizer__model': 'CountVectorizer', 'Input__answer__vectorizer__max_features': 4000, 'Input__answer__vectorizer__env': 'scikit-learn', 'Classifier__n_estimators': 200, 'Classifier__model': 'RandomForestClassifier', 'Classifier__max_depth': 15}
Best parameters for migration:
{'Input__answer__vectorizer__model': 'CountVectorizer', 'Input__answer__vectorizer__max_features': 2500, 'Input__answer__vectorizer__env': 'scikit-learn', 'Classifier__model': 'SGDClassifier', 'Classifier__loss': 'hinge', 'Classifier__alpha': 0.001}
Best parameters for entrepreneur:
{'Input__question__vectorizer__model': 'TfidfVectorizer', 'Input__question__vectorizer__max_features': 1500, 'Input__question__vectorizer__env': 'scikit-learn', 'Input__answer__vectorizer__model': 'TfidfVectorizer', 'Input__answer__vectorizer__max_features': 2500, 'Input__answer__vectorizer__env': 'scikit-learn', 'Classifier__model': 'SGDClassifier', 'Classifier__loss': 'hinge', 'Classifier__alpha': 0.0001}
Best parameters for secular:
{'Input__question__vectorizer__model': 'TfidfVectorizer', 'Input__question__vectorizer__max_features': 2500, 'Input__question__vectorizer__env': 'scikit-learn', 'Input__answer__vectorizer__model': 'TfidfVectorizer', 'Input__answer__vectorizer__max_features': 4000, 'Input__answer__vectorizer__env': 'scikit-learn', 'Classifier__n_estimators': 100, 'Classifier__model': 'RandomForestClassifier', 'Classifier__max_depth': 5}
Best parameters for marriage:
{'Input__answer__vectorizer__model': 'CountVectorizer', 'Input__answer__vectorizer__max_features': 1500, 'Input__answer__vectorizer__env': 'scikit-learn', 'Classifier__n_estimators': 100, 'Classifier__model': 'RandomForestClassifier', 'Classifier__max_depth': 15}
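Since refitting all twelve grids is expensive, the fitted models are worth persisting. A sketch using joblib, assuming the fitted Model objects are picklable:
import joblib

# One file per annotation variable; paths are illustrative
for code_var, model in fitted_models.items():
    joblib.dump(model, f"{code_var}_model.joblib")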