Source code for chaininglib.utils.dfops

import pandas as pd


# Beware: just like the files in chaininglib.process, this file contains functions operating on DataFrames.
# However, the functions in this file are general, in the sense that they do not specifically aim at
# manipulating corpus or lexicon data.
# In contrast, the functions in chaininglib.process do aim to manipulate corpus or lexicon data, so such
# functions do not belong in this general module.


def check_valid_df(function_name, obj):
    '''
    This function is called by other functions to check that their input is a DataFrame, when one is expected.
    If the input is not a DataFrame (or Series), an error is raised.

    Args:
        function_name: the name of the calling function, so the error message can show where the error occurred
        obj: the object to be checked
    Returns:
        N/A
    '''
    if not isinstance(obj, (pd.DataFrame, pd.Series)):
        raise ValueError(function_name + "() requires a Pandas DataFrame as argument. You might have forgotten to use object.kwic().")
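
# Illustrative usage sketch (not part of the original module): check_valid_df()
# passes silently on a DataFrame or Series and raises on anything else.
def _demo_check_valid_df():
    df = pd.DataFrame({"wordform": ["de", "het", "een"]})
    check_valid_df("demo", df)               # DataFrame: passes silently
    check_valid_df("demo", df["wordform"])   # Series: also passes
    try:
        check_valid_df("demo", ["de", "het"])  # plain list: raises ValueError
    except ValueError as e:
        print(e)
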
def property_freq(df, column_name):
    '''
    Count the values of a given property in a results DataFrame, and sort them by frequency.

    Args:
        df: DataFrame with results, one row per found token
        column_name: column name (property) whose values are to be counted
    Returns:
        a DataFrame of the values of this property, sorted by frequency (most frequent first).
        Column 'token count' contains the number of tokens, column 'perc' gives the percentage.
    '''
    # classic group-by + count, just like in SQL
    df = df.groupby(column_name).size()
    # the new column with the counts is given the name "token count",
    # and we set a new sequential index
    df = df.reset_index(name="token count")
    # sort by count, with the highest on top
    df = df.sort_values("token count", ascending=False)
    # set a new sequential index again
    # (the drop parameter makes sure the old index is NOT added as a column)
    df = df.reset_index(drop=True)
    # compute the percentage of each value
    total = df.sum(numeric_only=True, axis=0)
    df["perc"] = df["token count"] / total.iloc[0]
    return df
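
# Illustrative usage sketch (not part of the original module): counting the
# values of a made-up 'pos' column with property_freq().
def _demo_property_freq():
    df = pd.DataFrame({"pos": ["NOUN", "VERB", "NOUN", "DET", "NOUN"]})
    freq = property_freq(df, "pos")
    print(freq)
    # NOUN ends up on the top row, with 'token count' 3 and 'perc' 0.6
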
def df_filter(df_column, pattern, method='contains'):
    '''
    Helper function to build a condition for filtering a Pandas DataFrame,
    given a column and some value(s) to filter that column with.

    Args:
        df_column: a Pandas DataFrame column to filter on
        pattern: string, set or interval list to filter with
        method: "contains", "match", "isin" or "interval"
    Returns:
        a condition (boolean Series)

    >>> words_ending_with_e = df_filter( df_lexicon["wordform"], 'e$' )
    >>> df_lexicon_final_e = df_lexicon[ words_ending_with_e ]
    '''
    if method == "contains":
        if not isinstance(pattern, str):
            raise ValueError("df_filter 'contains' method needs a string as pattern.")
        condition = df_column.str.contains(pattern, na=False)
    elif method == "match":
        if not isinstance(pattern, str):
            raise ValueError("df_filter 'match' method needs a string as pattern.")
        condition = df_column.str.match(pattern, na=False)
    elif method == "isin":
        if not isinstance(pattern, set):
            raise ValueError("df_filter 'isin' method needs a set as pattern.")
        condition = df_column.isin(pattern)
    elif method == "interval":
        if not (isinstance(pattern, list) and len(pattern) == 2):
            raise ValueError("df_filter 'interval' method needs a list consisting of a lower and an upper boundary as pattern.")
        val_from, val_to = pattern
        if val_from is None and val_to is None:
            raise ValueError("Lower boundary or upper boundary of interval should be given.")
        col_numeric = df_column.astype('int32')
        # explicit None checks, so a boundary of 0 is not silently ignored
        if val_from is not None and val_to is not None:
            condition = (col_numeric >= int(val_from)) & (col_numeric <= int(val_to))
        elif val_from is not None:
            condition = (col_numeric >= int(val_from))
        else:
            condition = (col_numeric <= int(val_to))
    else:
        raise ValueError("Choose one of 'contains', 'match', 'isin' or 'interval' as method for df_filter.")
    return condition
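
# Illustrative usage sketch (not part of the original module) exercising the
# four df_filter() methods on a made-up lexicon.
def _demo_df_filter():
    df = pd.DataFrame({"wordform": ["boek", "boeken", "tafel"],
                       "year": [1620, 1750, 1890]})
    print(df[df_filter(df["wordform"], "boek")])                        # substring
    print(df[df_filter(df["wordform"], "boek$", method="match")])       # regex, anchored at the start
    print(df[df_filter(df["wordform"], {"tafel"}, method="isin")])      # set membership
    print(df[df_filter(df["year"], [1700, 1800], method="interval")])   # numeric range
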
def join_df(df_arr, join_type=None):
    '''
    This function joins multiple dataframes (= concatenation along axis 1).

    Args:
        df_arr: array of Pandas DataFrames
        join_type: {'inner', 'outer' (default)}
    Returns:
        a single Pandas DataFrame

    >>> new_df = join_df( [dataframe1, dataframe2] )
    >>> display_df(new_df)
    '''
    # ref: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
    if join_type is None:
        concat_df = pd.concat( df_arr, axis=1, sort=False )
    else:
        concat_df = pd.concat( df_arr, axis=1, join=join_type, sort=False )
    return concat_df
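
# Illustrative usage sketch (not part of the original module): joining two
# made-up tables side by side with join_df().
def _demo_join_df():
    df1 = pd.DataFrame({"token count": [3, 1]}, index=["boek", "tafel"])
    df2 = pd.DataFrame({"perc": [0.75, 0.25]}, index=["boek", "stoel"])
    print(join_df([df1, df2]))                      # outer join: union of the indexes
    print(join_df([df1, df2], join_type="inner"))   # inner join: only 'boek' survives
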
def column_difference(df_column1, df_column2):
    '''
    This function computes the differences and similarities between two Pandas DataFrame columns.

    Args:
        df_column1: a column (Series) of a Pandas DataFrame
        df_column2: a column (Series) of a Pandas DataFrame
    Returns:
        diff_left: set of words only in df_column1
        diff_right: set of words only in df_column2
        intersec: set of words in both df_column1 and df_column2

    >>> diff_left, diff_right, intersec = column_difference(df_corpus1["word 1"], df_corpus2["word 1"])
    >>> display( 'These words are only in DataFrame #1 : ' + ", ".join(diff_left) )
    >>> display( 'These words are only in DataFrame #2 : ' + ", ".join(diff_right) )
    >>> display( 'These words are common to both DataFrames : ' + ", ".join(intersec) )
    '''
    set_df1 = set(df_column1)
    set_df2 = set(df_column2)
    diff_left = set_df1.difference(set_df2)
    diff_right = set_df2.difference(set_df1)
    intersec = set_df1.intersection(set_df2)
    return diff_left, diff_right, intersec
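
# Illustrative usage sketch (not part of the original module): comparing the
# 'wordform' columns of two made-up corpora with column_difference().
def _demo_column_difference():
    df_a = pd.DataFrame({"wordform": ["boek", "tafel", "stoel"]})
    df_b = pd.DataFrame({"wordform": ["boek", "lamp"]})
    only_a, only_b, common = column_difference(df_a["wordform"], df_b["wordform"])
    print(sorted(only_a))   # ['stoel', 'tafel']
    print(sorted(only_b))   # ['lamp']
    print(sorted(common))   # ['boek']
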
def get_rank_diff(df1, df2, index=None, label1='rank_1', label2='rank_2'):
    '''
    This function compares the rankings of words common to two dataframes, and computes a rank diff,
    in such a way that one can see which words are very frequent in one set and rare in the other.

    Args:
        df1: a Pandas DataFrame with frequencies stored in a column "token count", e.g. a frequency list (see example)
        df2: a Pandas DataFrame with frequencies stored in a column "token count", e.g. a frequency list (see example)
        index (Optional): name of the column to be used as index (usually: the lemmata column)
        label1 (Optional): output column name for the ranks of the items of df1
        label2 (Optional): output column name for the ranks of the items of df2
    Returns:
        a Pandas DataFrame with lemmata (index), ranks of both input dataframes (label1 and label2 columns)
        and the rank diff ('rank_diff' column).

    >>> df_frequency_list1 = get_frequency_list(corpus_to_search1)
    >>> df_frequency_list2 = get_frequency_list(corpus_to_search2)
    >>> df_rankdiffs = get_rank_diff(df_frequency_list1, df_frequency_list2)
    '''
    check_valid_df("get_rank_diff", df1)
    check_valid_df("get_rank_diff", df2)

    if index is not None:
        # https://stackoverflow.com/questions/42196337/dataframe-set-index-not-setting
        df1 = df1.set_index(index, drop=True)
        df2 = df2.set_index(index, drop=True)

    # Find lemmata shared by both dataframes: computing rank diffs is only possible
    # for lemmata which are in both frames
    lemmata_list1 = set(df1.index.tolist())
    lemmata_list2 = set(df2.index.tolist())
    common_lemmata_list = list( lemmata_list1.intersection(lemmata_list2) )

    # Build dataframes limited to the common lemmata
    # (copies, so that adding the 'rank' column below does not modify the input frames)
    limited_df1 = df1.loc[ common_lemmata_list , : ].copy()
    limited_df2 = df2.loc[ common_lemmata_list , : ].copy()

    # Recompute ranks in both dataframes, because in each frame the original ranks were
    # computed with a lemmata list which might be larger than the lemmata list common
    # to both dataframes
    limited_df1['rank'] = limited_df1['token count'].rank(ascending=False).astype(int)
    limited_df2['rank'] = limited_df2['token count'].rank(ascending=False).astype(int)

    # Instantiate a dataframe for storing lemmata and rank diffs
    df_rankdiffs = pd.DataFrame(index=common_lemmata_list, columns=[label1, label2, 'rank_diff'])
    df_rankdiffs.index.name = 'lemmata'
    df_rankdiffs[label1] = limited_df1['rank']
    df_rankdiffs[label2] = limited_df2['rank']
    df_rankdiffs['rank_diff'] = ( df_rankdiffs[label1] - df_rankdiffs[label2] ).abs()

    return df_rankdiffs
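
# Illustrative usage sketch (not part of the original module): two made-up
# frequency lists with 'lemma' and 'token count' columns, compared with get_rank_diff().
def _demo_get_rank_diff():
    df1 = pd.DataFrame({"lemma": ["boek", "tafel", "stoel"],
                        "token count": [30, 20, 10]})
    df2 = pd.DataFrame({"lemma": ["boek", "tafel", "stoel"],
                        "token count": [5, 10, 40]})
    diffs = get_rank_diff(df1, df2, index="lemma")
    print(diffs.sort_values("rank_diff", ascending=False))
    # 'boek' (rank 1 vs 3) and 'stoel' (rank 3 vs 1) both get rank_diff 2
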
def get_relfreq_diff(df1, df2, index=None, label1='relfreq_1', label2='relfreq_2', operation="division", N=1):
    '''
    This function compares the relative frequencies of the words in two dataframes, and computes
    a relfreq diff, in such a way that one can see which words are very frequent in one set and
    rare in the other. The comparison is made over the union of the lemmata of both frames:
    lemmata missing from one frame get a relative frequency of 0 there.

    Args:
        df1: a Pandas DataFrame with relative frequencies stored in a column "perc" (see example)
        df2: a Pandas DataFrame with relative frequencies stored in a column "perc" (see example)
        index (Optional): name of the column to be used as index (usually: the lemmata column)
        label1 (Optional): output column name for the relative frequencies of the items of df1
        label2 (Optional): output column name for the relative frequencies of the items of df2
        operation (Optional): 'division' for dividing the relative frequencies by each other,
            'subtraction' for subtracting them from each other. Default 'division'
        N (Optional): smoothing parameter when operation is 'division'. Default 1.
    Returns:
        a Pandas DataFrame with lemmata (index), relative frequencies of both input dataframes
        (label1 and label2 columns) and the relfreq diff ('relfreq_diff' column).

    >>> df_frequency_list1 = get_frequency_list(corpus_to_search1)
    >>> df_frequency_list2 = get_frequency_list(corpus_to_search2)
    >>> df_relfreqdiffs = get_relfreq_diff(df_frequency_list1, df_frequency_list2)
    '''
    check_valid_df("get_relfreq_diff", df1)
    check_valid_df("get_relfreq_diff", df2)

    if index is not None:
        # https://stackoverflow.com/questions/42196337/dataframe-set-index-not-setting
        df1 = df1.set_index(index, drop=True)
        df2 = df2.set_index(index, drop=True)

    # Align both dataframes on the union of their lemmata: lemmata missing from
    # one of the frames get NaN here, which is filled with 0 below
    lemmata_list1 = set(df1.index.tolist())
    lemmata_list2 = set(df2.index.tolist())
    union_lemmata_list = list( lemmata_list1.union(lemmata_list2) )
    limited_df1 = df1.reindex(union_lemmata_list)
    limited_df2 = df2.reindex(union_lemmata_list)

    # Instantiate a dataframe for storing lemmata and relative frequency diffs
    df_relfreq_diffs = pd.DataFrame(index=union_lemmata_list, columns=[label1, label2, 'relfreq_diff'])
    df_relfreq_diffs.index.name = 'lemmata'
    df_relfreq_diffs[label1] = limited_df1['perc']
    df_relfreq_diffs[label2] = limited_df2['perc']
    df_relfreq_diffs = df_relfreq_diffs.fillna(0)

    if operation == "division":
        # smoothed ratio, so lemmata with relative frequency 0 do not cause division by zero
        df_relfreq_diffs['relfreq_diff'] = (df_relfreq_diffs[label1] + N) / (df_relfreq_diffs[label2] + N)
    elif operation == "subtraction":
        df_relfreq_diffs['relfreq_diff'] = df_relfreq_diffs[label1] - df_relfreq_diffs[label2]
    else:
        raise ValueError("Choose one of 'division' or 'subtraction' as operation for get_relfreq_diff.")

    return df_relfreq_diffs
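
# Illustrative usage sketch (not part of the original module): two made-up
# frequency lists with a 'perc' column, compared with get_relfreq_diff().
def _demo_get_relfreq_diff():
    df1 = pd.DataFrame({"lemma": ["boek", "tafel"], "perc": [0.6, 0.4]})
    df2 = pd.DataFrame({"lemma": ["boek", "stoel"], "perc": [0.1, 0.9]})
    print(get_relfreq_diff(df1, df2, index="lemma"))  # smoothed division (N=1)
    print(get_relfreq_diff(df1, df2, index="lemma", operation="subtraction"))
    # 'tafel' and 'stoel' appear in only one frame each, so they are
    # compared against a relative frequency of 0 in the other frame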