Source code for src.interface.emailclassifierapp

from src.classifiers.classifier import Classifier
from src.data_module.dataloader import DataLoader
from src.exceptions.exceptions import (
    PathError,
    ClassifierOptionError,
    ModelNotFound,
    InputDataError,
    VotingOptionForVotingClassifierError,
    EstimatorOptionError,
    VotingClassifierNotSupported,
    StackingClassifierNotSupported,
)
from src.models.model import Model


class EmailClassifierApp:
    """Class that is the interface for EmailClassifier.

    The app owns a ``DataLoader``, a ``Classifier`` and up to four ``Model``
    objects: three for the staged pipeline (job-hunt related -> confirmation
    -> invitation/rejection) and one for the single multiclass pipeline.

    Errors coming from the project's own exceptions are reported on stdout
    (message followed by error code) and the affected method returns ``None``
    instead of propagating the exception.
    """

    def __init__(self):
        """Constructor initializes data_loader and classifier. Models are None at this point."""
        self.data_loader = DataLoader()
        self.classifier = Classifier()
        self.model1 = None
        self.model2 = None
        self.model3 = None
        self.multiclassifier_model = None
        # Option name -> bound method of self.classifier that is called (with
        # no arguments) before self.classifier.get_classifier() is read.
        self.CLASSIFIERS = {
            "MultinomialNB": self.classifier.set_clf_nb,
            "LogisticRegression": self.classifier.set_clf_lr,
            "ComplementNB": self.classifier.set_clf_cnb,
            "BernoulliNB": self.classifier.set_clf_bnb,
            "SGDClassifier": self.classifier.set_clf_sgd,
            "RidgeClassifier": self.classifier.set_clf_rdg,
            "RandomForestClassifier": self.classifier.set_clf_rfc,
            "GradientBoostingClassifier": self.classifier.set_clf_gbc,
            "AdaBoostClassifier": self.classifier.set_clf_abc,
            "LinearSVC": self.classifier.set_clf_lsv,
            "SVC": self.classifier.set_clf_svc,
            "KNeighborsClassifier": self.classifier.set_clf_knn,
            "VotingClassifier": self.classifier.set_clf_vtc,
            "StackingClassifier": self.classifier.set_clf_stc,
            "DecisionTreeClassifier": self.classifier.set_clf_dtc,
            "ExtraTreeClassifier": self.classifier.set_clf_etc,
        }

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _report(error):
        """Print the message and the error code of a project exception."""
        print(error.get_message())
        print("Error code: " + error.get_code())

    def _models_missing(self, *models):
        """Report ModelNotFound and return True if any given model is None."""
        if any(model is None for model in models):
            self._report(ModelNotFound())
            return True
        return False

    def _normalize_emails(self, emails):
        """Return emails as a list of str, or None (after reporting) if invalid.

        A single string is wrapped into a one-element list. Anything that is
        not a list of strings is reported as InputDataError.
        """
        if isinstance(emails, str):
            return [emails]
        if not isinstance(emails, list) or not all(
            isinstance(email, str) for email in emails
        ):
            self._report(InputDataError())
            return None
        return emails

    @staticmethod
    def _print_accuracy(title, model):
        """Print a title followed by the two items returned by count_accuracy()."""
        print(title)
        accuracy = model.count_accuracy()
        print(accuracy[0])
        print(accuracy[1])

    def _estimators_valid(self, *estimators):
        """Report EstimatorOptionError and return False if any name is unknown."""
        known = self.classifier.ESTIMATORS_AND_CLASSIFIERS
        if all(self.classifier_option_check(name, known) for name in estimators):
            return True
        self._report(EstimatorOptionError())
        return False

    # ------------------------------------------------------------------
    # Model setters
    # ------------------------------------------------------------------
    def set_model1_clf(self, clf):
        """Method sets model object with provided classifier and returns it."""
        self.model1 = Model(clf)
        return self.model1

    def set_model2_clf(self, clf):
        """Method sets model object with provided classifier and returns it."""
        self.model2 = Model(clf)
        return self.model2

    def set_model3_clf(self, clf):
        """Method sets model object with provided classifier and returns it."""
        self.model3 = Model(clf)
        return self.model3

    def set_multiclassifier_model_clf(self, clf):
        """Method sets model object with provided classifier and returns it."""
        self.multiclassifier_model = Model(clf)
        return self.multiclassifier_model

    # ------------------------------------------------------------------
    # Data loading and option validation
    # ------------------------------------------------------------------
    def load_data_csv(self, path):
        """Method loads data from csv file.

        Returns whatever the data loader returns. If the loader raises
        PathError, the error is printed and None is returned, so callers
        must be prepared for a None result.
        """
        try:
            self.data_loader.set_path(path)
            return self.data_loader.load_data_csv()
        except PathError as e:
            self._report(e)
            return None

    def classifier_option_check(self, option, constant):
        """Method that checks if provided classifier option is valid.

        Returns True when ``option`` is a key of the mapping ``constant``.
        """
        return option in constant

    # ------------------------------------------------------------------
    # 3-stage pipelines
    # ------------------------------------------------------------------
    def train_3_stage_pipelines(
        self,
        path1="../data/training_emails_stage_1.csv",
        path2="../data/training_emails_stage_2.csv",
        path3="../data/training_emails_stage_3.csv",
        classifier_option_1="MultinomialNB",
        classifier_option_2="MultinomialNB",
        classifier_option_3="MultinomialNB",
        column_name_1="related_to_jobhunt",
        column_name_2="is_confirmation",
        column_name_3="is_invitation",
        column_name_main="email_text",
    ):
        """Method trains 3 pipelines with the usage of data given in argument.

        Default option is the data from 'data' folder. Each stage is trained
        independently, in order. If a stage has an unknown classifier option,
        or its data cannot be loaded, all three models are reset to None for
        that stage and training moves on to the next stage.
        VotingClassifier and StackingClassifier are not supported here and
        fall back to MultinomialNB after a printed notice.
        """

        def reset_models():
            """Drop all three stage models so later calls report ModelNotFound."""
            self.model1 = None
            self.model2 = None
            self.model3 = None

        def train_pipeline(path, classifier_option, setup, column_name_train):
            """Helper function for training pipeline."""
            df = self.load_data_csv(path)

            if not self.classifier_option_check(classifier_option, self.CLASSIFIERS):
                self._report(ClassifierOptionError())
                reset_models()
                return None

            # load_data_csv() has already printed the PathError; without this
            # guard the column lookups below would fail on None.
            if df is None:
                reset_models()
                return None

            if classifier_option == "VotingClassifier":
                unsupported = VotingClassifierNotSupported()
            elif classifier_option == "StackingClassifier":
                unsupported = StackingClassifierNotSupported()
            else:
                unsupported = None
            if unsupported is not None:
                self._report(unsupported)
                print("Classifier reset to default MultinomialNB")
                classifier_option = "MultinomialNB"

            # uses the key of dictionary and calls the corresponding method
            self.CLASSIFIERS[classifier_option]()

            model = setup(self.classifier.get_classifier())
            model.build_pipeline()
            model.set_X(df[column_name_main])
            model.set_y(df[column_name_train])
            model.train()
            return None

        train_pipeline(path1, classifier_option_1, self.set_model1_clf, column_name_1)
        train_pipeline(path2, classifier_option_2, self.set_model2_clf, column_name_2)
        train_pipeline(path3, classifier_option_3, self.set_model3_clf, column_name_3)

    def view_3_stage_pipelines_accuracy(self):
        """Method displays the accuracy of the 3 pipelines.

        Prints a ModelNotFound report instead if any stage model is missing.
        """
        if self._models_missing(self.model1, self.model2, self.model3):
            return None

        stages = ("1st", "2nd", "3rd")
        models = (self.model1, self.model2, self.model3)
        for stage, model in zip(stages, models):
            self._print_accuracy(stage + " stage accuracy: ", model)
        return None

    def classify_emails_3_stage_pipelines(self, emails):
        """Classify one or multiple emails through 3-stage pipelines.

        Args:
            emails: a single string or a list of strings.

        Returns:
            A list of ``{"email_index": i, "classification": label}`` dicts,
            one per email, where label is "Not job-hunt related",
            "Confirmation", "Invitation" or "Rejection". Returns None (after
            printing the error) if a model is missing or the input is invalid.
        """
        if self._models_missing(self.model1, self.model2, self.model3):
            return None

        emails = self._normalize_emails(emails)
        if emails is None:
            return None

        results = []
        for i, text in enumerate(emails):
            # Stage 1: is it job-hunt related? Later stages run only if so.
            if not bool(self.model1.pipeline.predict([text])[0]):
                classification = "Not job-hunt related"
            # Stage 2: confirmation, or something that needs stage 3?
            elif bool(self.model2.pipeline.predict([text])[0]):
                classification = "Confirmation"
            # Stage 3: invitation or rejection?
            elif bool(self.model3.pipeline.predict([text])[0]):
                classification = "Invitation"
            else:
                classification = "Rejection"
            results.append({"email_index": i, "classification": classification})
        return results

    # ------------------------------------------------------------------
    # Multiclassifier pipeline
    # ------------------------------------------------------------------
    def train_multiclassifier_pipeline(
        self,
        path="../data/training_emails_multiclassifier.csv",
        classifier_option="MultinomialNB",
        column_name_train="email_type",
        column_name_main="email_text",
        estimator_1=None,
        estimator_2=None,
        estimator_3=None,
        voting_option="hard",
    ):
        """Method trains a pipeline that utilizes multiclassification.

        The estimator and voting arguments are only used when the classifier
        option is "VotingClassifier" or "StackingClassifier". On an unknown
        classifier option, or when the data cannot be loaded, the
        multiclassifier model is reset to None and None is returned.
        """
        df = self.load_data_csv(path)

        if not self.classifier_option_check(classifier_option, self.CLASSIFIERS):
            self._report(ClassifierOptionError())
            self.multiclassifier_model = None
            return None

        # load_data_csv() has already printed the PathError; without this
        # guard the column lookups below would fail on None.
        if df is None:
            self.multiclassifier_model = None
            return None

        # uses the key of dictionary and calls the corresponding method
        self.CLASSIFIERS[classifier_option]()
        if classifier_option == "VotingClassifier":
            self.set_voting_classifier_parameters(
                estimator_1, estimator_2, estimator_3, voting_option
            )
        if classifier_option == "StackingClassifier":
            self.set_stacking_classifier_estimators(
                estimator_1, estimator_2, estimator_3
            )

        model = self.set_multiclassifier_model_clf(self.classifier.get_classifier())
        model.build_pipeline()
        model.set_X(df[column_name_main])
        model.set_y(df[column_name_train])
        model.train()
        return None

    def view_multiclassifier_accuracy(self):
        """Method displays the accuracy of the multiclassifier.

        Prints a ModelNotFound report instead if the model is missing.
        """
        if self._models_missing(self.multiclassifier_model):
            return None
        self._print_accuracy("Multiclassifier accuracy: ", self.multiclassifier_model)
        return None

    def predict_with_multiclassifier(self, emails):
        """Method predicts emails' type with multiclassifier.

        Args:
            emails: a single string or a list of strings.

        Returns:
            A list of ``{"email_index": i, "classification": str(prediction)}``
            dicts, one per email, or None (after printing the error) if the
            model is missing or the input is invalid.
        """
        if self._models_missing(self.multiclassifier_model):
            return None

        emails = self._normalize_emails(emails)
        if emails is None:
            return None

        predict = self.multiclassifier_model.pipeline.predict
        return [
            {"email_index": i, "classification": str(predict([text])[0])}
            for i, text in enumerate(emails)
        ]

    # ------------------------------------------------------------------
    # Ensemble configuration
    # ------------------------------------------------------------------
    def set_voting_classifier_parameters(
        self, estimator_1, estimator_2, estimator_3, voting_option
    ):
        """Method allows user to set estimators and voting option for VotingClassifier.

        Does nothing when all three estimators are None. Otherwise the voting
        option must be "hard" or "soft" and every estimator must be a key of
        ``self.classifier.ESTIMATORS_AND_CLASSIFIERS``; a violation is printed
        and the estimators are left unchanged.
        """
        if estimator_1 is None and estimator_2 is None and estimator_3 is None:
            return None

        if voting_option == "hard":
            self.classifier.set_voting_hard()
        elif voting_option == "soft":
            self.classifier.set_voting_soft()
        else:
            self._report(VotingOptionForVotingClassifierError())
            return None

        if not self._estimators_valid(estimator_1, estimator_2, estimator_3):
            return None

        factories = self.classifier.ESTIMATORS_AND_CLASSIFIERS
        self.classifier.set_vc_clf_1(factories[estimator_1]())
        self.classifier.set_vc_clf_2(factories[estimator_2]())
        self.classifier.set_vc_clf_3(factories[estimator_3]())
        self.classifier.set_clf_vtc()
        return None

    def set_stacking_classifier_estimators(self, estimator_1, estimator_2, estimator_3):
        """Method allows user to set estimators for StackingClassifier.

        Does nothing when all three estimators are None. Otherwise every
        estimator must be a key of
        ``self.classifier.ESTIMATORS_AND_CLASSIFIERS``; a violation is printed
        and the estimators are left unchanged.
        """
        if estimator_1 is None and estimator_2 is None and estimator_3 is None:
            return None

        if not self._estimators_valid(estimator_1, estimator_2, estimator_3):
            return None

        factories = self.classifier.ESTIMATORS_AND_CLASSIFIERS
        self.classifier.set_sc_clf_1(factories[estimator_1]())
        self.classifier.set_sc_clf_2(factories[estimator_2]())
        self.classifier.set_sc_clf_3(factories[estimator_3]())
        self.classifier.set_clf_stc()
        return None