Source code for lazy_text_classifiers.model_selection

#!/usr/bin/env python

from __future__ import annotations

import random
import time
from typing import TYPE_CHECKING, Any, Collection

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_recall_fscore_support,
)

from .constants import MODEL_NAME_WRAPPER_LUT, ModelNames

if TYPE_CHECKING:
    from sklearn.pipeline import Pipeline

    from .model_wrappers.estimator_base import EstimatorBase

###############################################################################


class LazyTextClassifiers:
    # TODO: optuna??

    def __init__(
        self: "LazyTextClassifiers",
        verbose: int = 1,
        ignore_warnings: bool = True,
        random_state: int | None = None,
    ) -> None:
        """
        Class for managing fitting all classifiers.

        Parameters
        ----------
        verbose: int
            Logging verbosity level.
            Levels: 0 - no logging; 1 - log basic progress; 2 - log everything.
            Default: 1 (log basic progress)
        ignore_warnings: bool
            Should all warnings be ignored.
            Default: True (ignore warnings)
        random_state: int | None
            A seed to initialize random state.
            Default: None (no pre-set random seed)
        """
        self.verbose = verbose

        # Set up warnings
        if ignore_warnings:
            import warnings

            warnings.filterwarnings("ignore")

        # Handle seed
        if random_state is not None:
            torch.manual_seed(random_state)
            random.seed(random_state)
            np.random.seed(random_state)
    def fit(
        self: "LazyTextClassifiers",
        x_train: Collection[str],
        x_test: Collection[str],
        y_train: Collection[str],
        y_test: Collection[str],
        model_kwargs: dict[str, Any] | None = None,
    ) -> pd.DataFrame:
        """
        Fit all models with the provided data.

        Parameters
        ----------
        x_train: Collection[str]
            The training data, an iterable object where each item is a string.
        x_test: Collection[str]
            The testing data, an iterable object where each item is a string.
        y_train: Collection[str]
            The training labels, an iterable object where each item is a class.
        y_test: Collection[str]
            The testing labels, an iterable object where each item is a class.
        model_kwargs: dict[str, Any] | None
            Any specific model kwargs to pass through.
            Default: None (use default parameters and settings for all models)

        Returns
        -------
        pd.DataFrame
            The results and metrics returned from fitting all models.
        """
        # Handle kwargs
        if model_kwargs is None:
            model_kwargs = {}

        # Remove setfit model if data is larger than 200
        if len(x_train) > 200:
            MODEL_NAME_WRAPPER_LUT.pop(ModelNames.setfit_transformer)

        # Iterate model fitting and preds
        self.fit_models: dict[str, "EstimatorBase" | "Pipeline"] = {}
        results_rows: list[dict[str, str | float]] = []
        for model_name, model_wrapper in MODEL_NAME_WRAPPER_LUT.items():
            # Start perf time
            start_time = time.perf_counter()

            # Instantiate estimator with any extra kwargs
            if self.verbose > 0:
                print(f"Initializing model: '{model_name}'")
            estimator = model_wrapper(
                **model_kwargs.get(model_name, {}),
                verbose=self.verbose > 1,
            )

            # Run the fit method passing in x_train and y_train
            if self.verbose > 0:
                print(f"Fitting model: '{model_name}'")
            estimator = estimator.fit(x_train, y_train)
            self.fit_models[model_name] = estimator

            # Record training time
            duration = time.perf_counter() - start_time

            # Run the eval
            if self.verbose > 0:
                print(f"Testing model: '{model_name}'")

            # Handle pd.Series
            if isinstance(x_test, pd.Series):
                x_test = x_test.tolist()

            preds = estimator.predict(x_test)

            # Calc metrics
            acc = accuracy_score(
                y_test,
                preds,
            )
            bal_acc = balanced_accuracy_score(
                y_test,
                preds,
            )
            pre, rec, f1, _ = precision_recall_fscore_support(
                y_test,
                preds,
                average="weighted",
            )

            # Combine and log if desired
            this_model_result = {
                "model": model_name,
                "accuracy": acc,
                "balanced_accuracy": bal_acc,
                "precision": pre,
                "recall": rec,
                "f1": f1,
                "time": duration,
            }
            if self.verbose > 0:
                print(f"'{model_name}' eval results: {this_model_result}")
            results_rows.append(this_model_result)

        # Create dataframe of results
        self.results_df = (
            pd.DataFrame(
                results_rows,
            )
            .sort_values(by="f1", ascending=False)
            .reset_index(drop=True)
        )

        return self.results_df
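
###############################################################################

A minimal usage sketch (not part of the module source above), assuming a small illustrative dataset and a standard scikit-learn train/test split; the names `texts`, `labels`, and `ltc`, and the example strings and split parameters, are placeholders rather than values from the library:

from sklearn.model_selection import train_test_split

from lazy_text_classifiers.model_selection import LazyTextClassifiers

# Illustrative data: any iterable of strings with matching class labels works
texts = ["great product", "terrible support", "works as expected", "never again"]
labels = ["positive", "negative", "positive", "negative"]

# Standard train/test split
x_train, x_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.25, random_state=0
)

# Fit every registered model and collect a metrics table sorted by f1
ltc = LazyTextClassifiers(verbose=1, random_state=0)
results = ltc.fit(x_train, x_test, y_train, y_test)
print(results)

# Individual fitted estimators remain available via ltc.fit_models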