Source code for src.interface.emailclassifierapp

from src.classifiers.classifier import Classifier
from src.data_module.dataloader import DataLoader
from src.exceptions.exceptions import (
    PathError,
    ClassifierOptionError,
    ModelNotFound,
    InputDataError,
    VotingOptionForVotingClassifierError,
    EstimatorOptionError,
    VotingClassifierNotSupported,
    StackingClassifierNotSupported,
)
from src.models.model import Model



[docs]
class EmailClassifierApp:
    """Class that is the interface for EmailClassifier"""

    def __init__(self):
        """Create the collaborators and register the classifier setters.

        All four model attributes start out as ``None``. ``CLASSIFIERS`` maps
        each accepted classifier name to the bound ``Classifier`` method that
        is called when that name is requested.
        """
        # No model exists until one of the training methods builds it.
        self.model1 = self.model2 = self.model3 = None
        self.multiclassifier_model = None

        self.data_loader = DataLoader()
        clf = Classifier()
        self.classifier = clf

        # Name -> bound setter on the shared Classifier instance.
        self.CLASSIFIERS = {
            "MultinomialNB": clf.set_clf_nb,
            "LogisticRegression": clf.set_clf_lr,
            "ComplementNB": clf.set_clf_cnb,
            "BernoulliNB": clf.set_clf_bnb,
            "SGDClassifier": clf.set_clf_sgd,
            "RidgeClassifier": clf.set_clf_rdg,
            "RandomForestClassifier": clf.set_clf_rfc,
            "GradientBoostingClassifier": clf.set_clf_gbc,
            "AdaBoostClassifier": clf.set_clf_abc,
            "LinearSVC": clf.set_clf_lsv,
            "SVC": clf.set_clf_svc,
            "KNeighborsClassifier": clf.set_clf_knn,
            "VotingClassifier": clf.set_clf_vtc,
            "StackingClassifier": clf.set_clf_stc,
            "DecisionTreeClassifier": clf.set_clf_dtc,
            "ExtraTreeClassifier": clf.set_clf_etc,
        }


[docs]
    def set_model1_clf(self, clf):
        """Wrap ``clf`` in a new ``Model``, store it as ``model1`` and return it."""
        stage_model = Model(clf)
        self.model1 = stage_model
        return stage_model



[docs]
    def set_model2_clf(self, clf):
        """Wrap ``clf`` in a new ``Model``, store it as ``model2`` and return it."""
        stage_model = Model(clf)
        self.model2 = stage_model
        return stage_model



[docs]
    def set_model3_clf(self, clf):
        """Wrap ``clf`` in a new ``Model``, store it as ``model3`` and return it."""
        stage_model = Model(clf)
        self.model3 = stage_model
        return stage_model



[docs]
    def set_multiclassifier_model_clf(self, clf):
        """Wrap ``clf`` in a new ``Model``, store it as ``multiclassifier_model`` and return it."""
        multi_model = Model(clf)
        self.multiclassifier_model = multi_model
        return multi_model



[docs]
    def load_data_csv(self, path):
        """Method loads data from csv file. If it fails, an exception is raised."""
        try:
            self.data_loader.set_path(path)
            df = self.data_loader.load_data_csv()
            return df
        except PathError as e:
            print(e.get_message())
            print("Error code: " + e.get_code())
            return None



[docs]
    def classifier_option_check(self, option, constant):
        """Method that checks if provided classifier option is valid."""
        return False if option not in constant.keys() else True



[docs]
    def train_3_stage_pipelines(
        self,
        path1="../data/training_emails_stage_1.csv",
        path2="../data/training_emails_stage_2.csv",
        path3="../data/training_emails_stage_3.csv",
        classifier_option_1="MultinomialNB",
        classifier_option_2="MultinomialNB",
        classifier_option_3="MultinomialNB",
        column_name_1="related_to_jobhunt",
        column_name_2="is_confirmation",
        column_name_3="is_invitation",
        column_name_main="email_text",
    ):
        """Method trains 3 pipelines with the usage of data given in argument. Default option is the data from 'data' folder"""

        def train_pipeline(path, classifier_option, setup, column_name_train):
            """Helper function for training pipeline."""
            try:
                df = self.load_data_csv(path)
                if not self.classifier_option_check(
                    classifier_option, self.CLASSIFIERS
                ):
                    raise ClassifierOptionError
                else:
                    try:
                        if classifier_option == "VotingClassifier":
                            raise VotingClassifierNotSupported
                        else:
                            try:
                                if classifier_option == "StackingClassifier":
                                    raise StackingClassifierNotSupported
                            except StackingClassifierNotSupported as e:
                                print(e.get_message())
                                print("Error code: " + e.get_code())
                                print("Classifier reset to default MultinomialNB")
                                classifier_option = "MultinomialNB"
                    except VotingClassifierNotSupported as e:
                        print(e.get_message())
                        print("Error code: " + e.get_code())
                        print("Classifier reset to default MultinomialNB")
                        classifier_option = "MultinomialNB"
                    finally:
                        # uses the key of dictionary and calls the corresponding method
                        self.CLASSIFIERS[classifier_option]()
            except ClassifierOptionError as e:
                print(e.get_message())
                print("Error code: " + e.get_code())
                self.model1 = None
                self.model2 = None
                self.model3 = None
                return None
            else:
                model = setup(self.classifier.get_classifier())
                model.build_pipeline()
                model.set_X(df[column_name_main])
                model.set_y(df[column_name_train])
                model.train()

        train_pipeline(path1, classifier_option_1, self.set_model1_clf, column_name_1)
        train_pipeline(path2, classifier_option_2, self.set_model2_clf, column_name_2)
        train_pipeline(path3, classifier_option_3, self.set_model3_clf, column_name_3)



[docs]
    def view_3_stage_pipelines_accuracy(self):
        """Method displays the accuracy of the 3 pipelines."""
        try:
            if self.model1 is None or self.model2 is None or self.model3 is None:
                raise ModelNotFound()
        except ModelNotFound as e:
            print(e.get_message())
            print("Error code: " + e.get_code())
            return None
        else:

            def view(stage, model):
                """Helper function for viewing 3 stage pipelines accuracy."""
                print(stage + " stage accuracy: ")
                accuracy = model.count_accuracy()
                print(accuracy[0])
                print(accuracy[1])

            stages = ["1st", "2nd", "3rd"]
            models = [self.model1, self.model2, self.model3]

            for s, m in zip(stages, models):
                view(s, m)



[docs]
    def classify_emails_3_stage_pipelines(self, emails):
        """Classify one or multiple emails through 3-stage pipelines."""
        try:
            if self.model1 is None or self.model2 is None or self.model3 is None:
                raise ModelNotFound()
        except ModelNotFound as e:
            print(e.get_message())
            print("Error code: " + e.get_code())
            return None
        else:

            if isinstance(emails, str):
                emails = [emails]
            try:
                if not isinstance(emails, list):
                    raise InputDataError()

                for email in emails:
                    if not isinstance(email, str):
                        raise InputDataError()
            except InputDataError as e:
                print(e.get_message())
                print("Error code: " + e.get_code())
                return None

            results = []

            for i, text in enumerate(emails):
                # --- Stage 1: Is it jobhunt-related? ---
                prediction_stage_1 = self.model1.pipeline.predict([text])[0]
                if not bool(prediction_stage_1):
                    results.append(
                        {"email_index": i, "classification": "Not job-hunt related"}
                    )
                    continue

                # --- Stage 2: Confirmation or next step? ---
                prediction_stage2 = self.model2.pipeline.predict([text])[0]
                if bool(prediction_stage2):
                    results.append({"email_index": i, "classification": "Confirmation"})
                    continue

                # --- Stage 3: Invitation or Rejection? ---
                prediction_stage3 = self.model3.pipeline.predict([text])[0]
                classification = (
                    "Invitation" if bool(prediction_stage3) else "Rejection"
                )

                results.append({"email_index": i, "classification": classification})

            return results



[docs]
    def train_multiclassifier_pipeline(
        self,
        path="../data/training_emails_multiclassifier.csv",
        classifier_option="MultinomialNB",
        column_name_train="email_type",
        column_name_main="email_text",
        estimator_1=None,
        estimator_2=None,
        estimator_3=None,
        voting_option="hard",
    ):
        """Method trains a pipeline that utilizes multiclassification."""
        try:
            df = self.load_data_csv(path)
            if not self.classifier_option_check(classifier_option, self.CLASSIFIERS):
                raise ClassifierOptionError
            else:
                # uses the key of dictionary and calls the corresponding method
                self.CLASSIFIERS[classifier_option]()

                if classifier_option == "VotingClassifier":
                    self.set_voting_classifier_parameters(
                        estimator_1, estimator_2, estimator_3, voting_option
                    )

                if classifier_option == "StackingClassifier":
                    self.set_stacking_classifier_estimators(
                        estimator_1, estimator_2, estimator_3
                    )

        except ClassifierOptionError as e:
            print(e.get_message())
            print("Error code: " + e.get_code())
            self.multiclassifier_model = None
            return None
        else:
            model = self.set_multiclassifier_model_clf(self.classifier.get_classifier())
            model.build_pipeline()
            model.set_X(df[column_name_main])
            model.set_y(df[column_name_train])
            model.train()



[docs]
    def view_multiclassifier_accuracy(self):
        """Method displays the accuracy of the multiclassifier."""
        try:
            if self.multiclassifier_model is None:
                raise ModelNotFound()
        except ModelNotFound as e:
            print(e.get_message())
            print("Error code: " + e.get_code())
            return None
        else:
            print("Multiclassifier accuracy: ")
            accuracy = self.multiclassifier_model.count_accuracy()
            print(accuracy[0])
            print(accuracy[1])



[docs]
    def predict_with_multiclassifier(self, emails):
        """Method predicts emails' type with multiclassifier."""

        try:
            if self.multiclassifier_model is None:
                raise ModelNotFound()
        except ModelNotFound as e:
            print(e.get_message())
            print("Error code: " + e.get_code())
            return None
        else:

            if isinstance(emails, str):
                emails = [emails]
            try:
                if not isinstance(emails, list):
                    raise InputDataError()

                for email in emails:
                    if not isinstance(email, str):
                        raise InputDataError()
            except InputDataError as e:
                print(e.get_message())
                print("Error code: " + e.get_code())
                return None

            results = []

            for i, text in enumerate(emails):
                prediction_stage_1 = self.multiclassifier_model.pipeline.predict(
                    [text]
                )[0]
                results.append(
                    {"email_index": i, "classification": str(prediction_stage_1)}
                )

            return results



[docs]
    def set_voting_classifier_parameters(
        self, estimator_1, estimator_2, estimator_3, voting_option
    ):
        """Method allows user to set estimators and voting option for VotingClassifier."""

        if estimator_1 is None and estimator_2 is None and estimator_3 is None:
            return None

        try:
            if voting_option == "hard":
                self.classifier.set_voting_hard()
            elif voting_option == "soft":
                self.classifier.set_voting_soft()
            else:
                raise VotingOptionForVotingClassifierError
        except VotingOptionForVotingClassifierError as e:
            print(e.get_message())
            print("Error code: " + e.get_code())
        else:
            try:
                if (
                    not self.classifier_option_check(
                        estimator_1, self.classifier.ESTIMATORS_AND_CLASSIFIERS
                    )
                    or not self.classifier_option_check(
                        estimator_2, self.classifier.ESTIMATORS_AND_CLASSIFIERS
                    )
                    or not self.classifier_option_check(
                        estimator_3, self.classifier.ESTIMATORS_AND_CLASSIFIERS
                    )
                ):
                    raise EstimatorOptionError
            except EstimatorOptionError as e:
                print(e.get_message())
                print("Error code: " + e.get_code())
            else:
                self.classifier.set_vc_clf_1(
                    self.classifier.ESTIMATORS_AND_CLASSIFIERS[estimator_1]()
                )
                self.classifier.set_vc_clf_2(
                    self.classifier.ESTIMATORS_AND_CLASSIFIERS[estimator_2]()
                )
                self.classifier.set_vc_clf_3(
                    self.classifier.ESTIMATORS_AND_CLASSIFIERS[estimator_3]()
                )
                self.classifier.set_clf_vtc()



[docs]
    def set_stacking_classifier_estimators(self, estimator_1, estimator_2, estimator_3):
        """Method allows user to set estimators for StackingClassifier."""

        if estimator_1 is None and estimator_2 is None and estimator_3 is None:
            return None

        try:
            if (
                not self.classifier_option_check(
                    estimator_1, self.classifier.ESTIMATORS_AND_CLASSIFIERS
                )
                or not self.classifier_option_check(
                    estimator_2, self.classifier.ESTIMATORS_AND_CLASSIFIERS
                )
                or not self.classifier_option_check(
                    estimator_3, self.classifier.ESTIMATORS_AND_CLASSIFIERS
                )
            ):
                raise EstimatorOptionError
        except EstimatorOptionError as e:
            print(e.get_message())
            print("Error code: " + e.get_code())
        else:
            self.classifier.set_sc_clf_1(
                self.classifier.ESTIMATORS_AND_CLASSIFIERS[estimator_1]()
            )
            self.classifier.set_sc_clf_2(
                self.classifier.ESTIMATORS_AND_CLASSIFIERS[estimator_2]()
            )
            self.classifier.set_sc_clf_3(
                self.classifier.ESTIMATORS_AND_CLASSIFIERS[estimator_3]()
            )
            self.classifier.set_clf_stc()