# Source code for CovertMark.analytics.learning

from . import constants, entropy
from ..data import utils as data_utils

from abc import ABC
from sklearn import preprocessing, model_selection, linear_model

"""
Different classifier architectures that can be used for traffic classification.
"""

class _InvalidInputError(ValueError, AssertionError):
    """
    Raised when the feature rows or labels given to a classifier are unusable.

    Also derives from AssertionError because these checks used to be plain
    ``assert`` statements, so callers expecting that type keep working.
    """


class _NotTrainedError(RuntimeError, AttributeError):
    """
    Raised when a prediction is requested before any training run.

    Also derives from AttributeError, which is what reading the then-unset
    feature width used to raise.
    """


class Classifier(ABC):
    """
    Abstract class for common classifier methods.

    Wraps an estimator object exposing ``fit(features, labels)`` and
    ``predict(features)``, and checks the shape of the inputs before handing
    them over.
    """

    def __init__(self, classifier):
        """
        :param classifier: the estimator to wrap; it must provide ``fit`` and
            ``predict`` methods.
        """

        self.__classifier = classifier
        # Number of features per row seen by the latest training run; stays
        # None until train() has succeeded in validating its input.
        self._feature_width = None

    def train(self, training_features, training_labels):
        """
        Train on training input feature rows.

        :param list training_features: input rows of features for training,
            without labels. These should be redrawn between training runs.
        :param list training_labels: corresponding labels for the input rows.
        :raises ValueError: if there are no training rows, or the number of
            rows and labels differ (the exception is also an AssertionError).
        """

        # Explicit raises instead of ``assert`` so the checks survive
        # ``python -O``; len() rather than truthiness so that array-like
        # inputs whose truth value is ambiguous are still accepted.
        if len(training_features) == 0:
            raise _InvalidInputError("training_features must not be empty.")
        if len(training_features) != len(training_labels):
            raise _InvalidInputError(
                "Got {} training rows but {} labels.".format(
                    len(training_features), len(training_labels)))

        self._feature_width = len(training_features[0])
        self.__classifier.fit(training_features, training_labels)

    def predict(self, validation_inputs):
        """
        Classify unseen inputs, can be used for both validation prediction
        and recall prediction.

        :param list validation_inputs: input rows of features for validation,
            should not have been seen during training.
        :returns: array of positive(1) / negative(0) labels predicted.
        :raises ValueError: if there are no input rows, or their width differs
            from the rows used in training (also an AssertionError).
        :raises RuntimeError: if called before :meth:`train` (the exception is
            also an AttributeError).
        """

        if len(validation_inputs) == 0:
            raise _InvalidInputError("validation_inputs must not be empty.")

        # getattr() keeps the error clear even for instances whose
        # construction bypassed Classifier.__init__.
        expected_width = getattr(self, "_feature_width", None)
        if expected_width is None:
            raise _NotTrainedError("train() must be called before predict().")

        actual_width = len(validation_inputs[0])
        if expected_width != actual_width:
            raise _InvalidInputError(
                "Expected rows of {} features, got {}.".format(
                    expected_width, actual_width))

        return self.__classifier.predict(validation_inputs)
class LogisticRegression(Classifier):
    """
    General-purpose logistic regression classifier. It pairs a stable L1
    penalty with the fast SAGA solver so that training converges quickly,
    although it can overfit on high dimensional spaces.
    """

    def __init__(self, multithreaded=True):
        """
        :param bool multithreaded: when true the estimator is created with
            ``n_jobs=-1``, otherwise with ``n_jobs=1``.
        """

        if multithreaded:
            workers = -1
        else:
            workers = 1

        estimator = linear_model.LogisticRegression(
            penalty='l1',
            dual=False,
            solver='saga',
            n_jobs=workers,
            max_iter=5000,
            warm_start=False,
        )

        # Keep a reference on this class as well as handing the estimator to
        # the base class, which stores it for train() and predict().
        self.__classifier = estimator
        super().__init__(estimator)
class SGD(Classifier):
    """
    Linear classification trained by stochastic gradient descent, offered as
    a less memory-intensive and incremental learning-compatible alternative
    to linear SVM (LinearSVC).
    """

    def __init__(self, loss="hinge", multithreaded=True):
        """
        :param str loss: loss function handed to the estimator, one of
            "hinge", "modified_huber" or "squared_hinge".
        :param bool multithreaded: when true the estimator is created with
            ``n_jobs=-1``, otherwise with ``n_jobs=1``.
        """

        supported_losses = ("hinge", "modified_huber", "squared_hinge")
        assert loss in supported_losses

        if multithreaded:
            workers = -1
        else:
            workers = 1

        estimator = linear_model.SGDClassifier(
            penalty='l1',
            loss=loss,
            max_iter=5000,
            n_jobs=workers,
            learning_rate='optimal',
            warm_start=False,
            class_weight="balanced",
        )

        # Keep a reference on this class as well as handing the estimator to
        # the base class, which stores it for train() and predict().
        self.__classifier = estimator
        super().__init__(estimator)