Source code for simplefit.eda

import pandas as pd
import altair as alt


import os
alt.renderers.enable('notebook')
alt.renderers.enable('mimetype')


[docs]def plot_distributions(data, bins = 40, dist_cols=None, class_label=None): """This function creates numerical distribution plots on either all the numeric columns or the ones provided to it Parameters ---------- data : pandas.DataFrame The dataframe for which distribution plot has to be created bins : int The number of bins for histogram plot dist_cols : list, optional The subset of numeric columns for which the histogram plots have to be generated class_label : str, optional The name of the target column only in case of classification dataset. For regression dataset, it is not required Returns ------- chart_numeric The Altair object for the plot Examples ------- >>> plot_distributions(data) >>> plot_distributions(data, dist_cols=['loudness', 'acousticness'], class_label='target') """ if data is None: raise ValueError("Required arg 'data' cannot be empty") if not isinstance(data, pd.DataFrame): raise TypeError("Please enter data of type pd.DataFrame") if not isinstance(bins, int): raise TypeError("bins should be of type int") if dist_cols is not None: if not isinstance(dist_cols, list): raise TypeError("The entered dist_cols should be of 'list' type") else: numeric_features = dist_cols else: numeric_features = list(data.select_dtypes('number').columns) if class_label is None: chart_numeric = alt.Chart(data).mark_bar().encode( alt.X(alt.repeat(), type='quantitative', bin=alt.Bin(maxbins=bins)), y=alt.Y('count()') ).properties( width=300, height=200 ).repeat( numeric_features, columns = 4 ) else: if not type(class_label) == str: raise TypeError("`class_label` should be of string type") else: chart_numeric = alt.Chart(data).mark_bar(opacity=0.6).encode( alt.X(alt.repeat(), type='quantitative', bin=alt.Bin(maxbins=bins)), y=alt.Y('count()', stack=False), color=class_label+':N' ).properties( width=300, height=200 ).repeat( numeric_features, columns = 4 ) return chart_numeric
[docs]def plot_corr(data, corr='spearman'): """This function creates correlation plot for all the columns in the dataframe Parameters ---------- data : pandas.DataFrame The dataframe for which distribution plot has to be created corr : str The correlation method, which can be among 'spearman', 'kendall' or 'pearson' The default value is spearman Returns ------- corr_plot The Altair object for the plot Examples ------- >>> plot_corr(data) >>> plot_corr(data, corr='kendall') """ if data is None : raise ValueError("Required arg 'data' cannot be empty") if not isinstance(data, pd.DataFrame): raise TypeError("Please enter data of type pd.DataFrame") if corr == '': raise ValueError("'corr' cannot be empty") if corr not in ['spearman', 'pearson', 'kendall']: raise ValueError("corr should be one of these: 'spearman', 'pearson', 'kendall'") corr_df= data.corr(corr).stack().reset_index(name='corr') corr_plot = alt.Chart(corr_df, title='Correlation Plot among all features and target').mark_rect().encode( x=alt.X('level_0', title='All features'), y=alt.Y('level_1', title='All features'), tooltip='corr', color=alt.Color('corr', scale=alt.Scale(domain=(-1, 1), scheme='purpleorange'))) return corr_plot
[docs]def plot_splom(data, pair_cols=None): """This function creates SPLOM plot for all the numeric columns in the dataframe or the ones passed by the user Parameters ---------- data : pandas.DataFrame The dataframe for which distribution plot has to be created pair_cols : list The list of dataframe columns, for which correlation plot is to be generated Returns ------- splom_chart The Altair object for the plot Examples ------- >>> plot_splom(data) >>> plot_splom(data, pair_cols=['loudness', 'acousticness', 'energy']) """ if data is None : raise ValueError("Required arg 'data' cannot be empty") if not isinstance(data, pd.DataFrame): raise TypeError("Please enter data of type pd.DataFrame") if pair_cols==[]: raise ValueError("pair_cols should not be empty list") if pair_cols is None: pair_cols = list(data.select_dtypes('number').columns) else: if not isinstance(pair_cols, list): raise TypeError("The entered pair_cols should be of 'list' type") splom_chart = alt.Chart(data).mark_point(opacity=0.3, size=10).encode( alt.X(alt.repeat('row'), type='quantitative'), alt.Y(alt.repeat('column'), type='quantitative') ).properties( width=200, height=200 ).repeat( column=pair_cols, row=pair_cols ) return splom_chart