Source code for magmaviz.histogram

import altair as alt
import pandas as pd

def histogram(df, x, y):
    """Plot a bar chart of an aggregation using the magma color scheme.

    The inputs are validated first, then an Altair bar chart is built with
    ``x`` on the x-axis and the aggregation ``y`` on the y-axis. The bars are
    colored by the same aggregation using the "magma" scheme.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing the variables for plotting.
    x : str
        Column name of the variable to be plotted on the x-axis.
    y : str
        An aggregation expression to be plotted on the y-axis, for example
        ``"count()"`` or ``"mean(column)"``.
        The supported aggregation operations are: ['average', 'count',
        'distinct', 'max', 'mean', 'median', 'min', 'missing', 'product',
        'q1', 'q3', 'ci0', 'ci1', 'stderr', 'stdev', 'stdevp', 'sum',
        'valid', 'values', 'variance', 'variancep']

    Returns
    -------
    altair.Chart
        A bar chart of the aggregation ``y`` against ``x``.

    Raises
    ------
    TypeError
        If ``df`` is not a pandas DataFrame or ``x`` is not a string.
        ``validate`` also raises it when ``y`` is not a string.
    ValueError
        If ``x`` is not a column of ``df``, or if ``validate`` rejects ``y``.

    Examples
    --------
    >>> from magmaviz.histogram import histogram
    >>> histogram(mtcars, "cars", "count()")
    """
    # Fail early with clear messages instead of letting Altair raise later.
    if not isinstance(df, pd.core.frame.DataFrame):
        raise TypeError(
            "'df' should be of type 'pandas.core.frame.DataFrame', a pandas dataframe."
        )

    if not isinstance(x, str):
        raise TypeError("'x' should be of type 'str'.")

    if x not in list(df.columns):
        raise ValueError("The column specified for 'x' does not exist in the dataframe.")

    # The aggregation expression has its own validation helper.
    validate(df, y)

    # The color channel reuses the y aggregation so bar height and hue agree.
    chart = alt.Chart(df).mark_bar().encode(
        x=x,
        y=y,
        color=alt.Color(y, scale=alt.Scale(scheme="magma")),
    )
    return chart

def validate(df, y):
    """Validate the aggregation expression given for the y-axis.

    ``y`` must look like ``function(field)``, optionally followed by a
    ``:type`` suffix (for example ``"mean(petalLength)"``, ``"count()"`` or
    ``"mean(petalLength):Q"``).

    Parameters
    ----------
    df : pandas.DataFrame
        The input dataframe. Its type is not checked here; only
        ``df.columns`` is read.
    y : str
        The user specified aggregation function to be plotted on the y-axis.
        The supported aggregation operations are: ['average', 'count',
        'distinct', 'max', 'mean', 'median', 'min', 'missing', 'product',
        'q1', 'q3', 'ci0', 'ci1', 'stderr', 'stdev', 'stdevp', 'sum',
        'valid', 'values', 'variance', 'variancep']

    Returns
    -------
    None
        This function only raises errors.

    Raises
    ------
    TypeError
        If ``y`` is not a string.
    ValueError
        If ``y`` does not contain exactly one ``"("``, has no closing ``")"``,
        or has text after ``")"`` that does not start with ``":"``.
        Also raised if the function name is not supported, or if a function
        other than ``count`` has an empty field or a field that is not a
        column of ``df``.

    Examples
    --------
    >>> validate(data.iris(), 'mean(petalLength)')
    >>> validate(data.iris(), 'count()')
    """
    supported_operations = ['average', 'count',
        'distinct', 'max', 'mean', 'median', 'min', 'missing', 'product',
        'q1', 'q3', 'ci0', 'ci1', 'stderr', 'stdev', 'stdevp', 'sum',
        'valid', 'values', 'variance', 'variancep']

    if not isinstance(y, str):
        raise TypeError("'y' should be of type 'str'.")

    # Exactly one "(" must separate the function name from the rest.
    pieces = y.split("(")
    if len(pieces) != 2:
        raise ValueError("'y' is not in a correct format as an aggregation function.")
    function_name, remainder = pieces

    if function_name not in supported_operations:
        raise ValueError(
            "The aggregation function specified for 'y' " +
            "is not one of " + str(supported_operations))

    # Everything before the first ")" is the field. The ")" must be present,
    # and anything after it may only be a ":type" suffix. Otherwise strings
    # such as "mean(a" or "mean(a)xyz" would pass as valid.
    encoding_field, closing, suffix = remainder.partition(")")
    if not closing or (suffix and not suffix.startswith(":")):
        raise ValueError("'y' is not in a correct format as an aggregation function.")

    # Every supported operation except 'count' needs a field naming a column.
    if function_name != "count":
        if encoding_field == '':
            raise ValueError(
            "The encoding field 'y' is specified without a type; " +
            "the type cannot be inferred because it does not match any column in the data"
            )

        if encoding_field not in list(df.columns):
            raise ValueError("The encoding field for 'y' does not exist in the dataframe.")