# Source code for simplefit.classifier

import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline


def classifier(train_df, target_col, numeric_feats=None, categorical_feats=None, cv=5):
    """Cross-validate a dummy baseline and a logistic regression on ``train_df``.

    The features are preprocessed (numeric columns standardized, categorical
    columns one-hot encoded) before being passed to the logistic regression.
    Both models use their default settings, giving quick access to baseline
    scores.

    Parameters
    ----------
    train_df : pandas.DataFrame
        The clean (no missing values) train data, including the target column.
    target_col : str
        The column of ``train_df`` that holds the target values.
    numeric_feats : list, optional
        The numeric features to use in the model. If ``None`` or an empty
        list, every numeric column other than the target is used.
    categorical_feats : list, optional
        The categorical features to use in the model. If ``None`` or an empty
        list, every non-numeric column other than the target is used.
    cv : int, optional
        The number of cross-validation folds (forwarded to
        ``sklearn.model_selection.cross_validate``). Defaults to 5.

    Returns
    -------
    pandas.DataFrame
        One column per model (``"DummyClassifier"``, ``"LogisticRegression"``)
        and one row per ``cross_validate`` output (fit time, score time,
        test score, train score), each averaged over the folds.

    Raises
    ------
    TypeError
        If ``train_df`` is not a data frame, if ``numeric_feats`` or
        ``categorical_feats`` is neither ``None`` nor a list, or if
        ``target_col`` is not a string.
    ValueError
        If ``train_df`` contains missing values, or if a requested feature
        is not a column of the data (the target column is not a feature).

    Examples
    --------
    >>> classifier(train_df, target_col='target')
    >>> classifier(train_df, target_col='target', numeric_feats=[], categorical_feats=[])
    >>> classifier(train_df, target_col='target', numeric_feats=['danceability', 'loudness'], categorical_feats=['genre'], cv=10)
    """
    if not isinstance(train_df, pd.DataFrame):
        raise TypeError("Invalid function input. Please enter a data frame")
    if train_df.isna().any().any():
        raise ValueError("Invalid function input. Please pass a clean pandas data frame")
    # ``None`` is the documented default, so only reject non-list values that
    # were actually supplied.
    if numeric_feats is not None and not isinstance(numeric_feats, list):
        raise TypeError("Numeric Features should be passed as a list")
    if categorical_feats is not None and not isinstance(categorical_feats, list):
        raise TypeError("Categorical Features should be passed as a list")
    if not isinstance(target_col, str):
        raise TypeError("Target column must be passed as a string")

    X_train = train_df.drop(columns=[target_col])
    y_train = train_df[target_col]

    # Auto-detect from X_train (not train_df) so the target column can never
    # end up among the features handed to the column transformer.
    if not numeric_feats:
        numeric_feats = X_train.select_dtypes(include='number').columns.tolist()
    if not categorical_feats:
        categorical_feats = X_train.select_dtypes(exclude='number').columns.tolist()

    # Fail fast with a clear message instead of letting every CV fit fail.
    missing = [
        col for col in numeric_feats + categorical_feats
        if col not in X_train.columns
    ]
    if missing:
        raise ValueError(
            f"Feature columns not found in the training data (excluding the target): {missing}"
        )

    preprocessor = make_column_transformer(
        (StandardScaler(), numeric_feats),
        # A category seen only in a validation fold must not break scoring.
        (OneHotEncoder(handle_unknown="ignore"), categorical_feats),
    )

    models = {
        "DummyClassifier": DummyClassifier(),
        "LogisticRegression": make_pipeline(preprocessor, LogisticRegression()),
    }

    mean_scores = []
    for name, model in models.items():
        scores = cross_validate(model, X_train, y_train, return_train_score=True, cv=cv)
        mean_scores.append(pd.DataFrame(scores).mean().to_frame(name))

    return pd.concat(mean_scores, axis=1)