import pandas as pd
# beware: just like the files in chaininglib.process, this file contains functions operating on DataFrames.
# However, the functions in this file are general in the sense that they don't specifically aim at manipulating
# corpus or lexicon data.
# By contrast, the functions in chaininglib.process do aim to manipulate corpus or lexicon data, so those
# functions don't belong in this general section.
def check_valid_df(function_name, obj):
    '''
    Check that the given object is a Pandas DataFrame (or Series), when one is expected.

    This function is called by others to validate their input.
    If the input is not a DataFrame or Series, an error is thrown.

    Args:
        function_name: the name of the calling function, so as to be able to show where an error occured
        obj: the object to be checked
    Returns:
        N/A
    Raises:
        ValueError: if obj is neither a DataFrame nor a Series
    '''
    # a Series is accepted too: single-column results are often passed around as Series
    if not isinstance(obj, (pd.DataFrame, pd.Series)):
        raise ValueError(function_name+"() requires a Pandas DataFrame as argument. You might have forgotten to use object.kwic().")
def property_freq(df, column_name):
    '''
    Count values for a certain property in a results DataFrame, and sort them by frequency
    Args:
        df: DataFrame with results, one row per found token
        column_name: Column name (property) to count
    Returns:
        a DataFrame of the values for this property, sorted by frequency.
        Column 'token count' contains the number of tokens, column 'perc' gives the
        relative frequency (fraction of the total token count).
    '''
    # classic group by + count, just like in SQL;
    # the counts column is named "token count" and a new sequential index is set
    counts = df.groupby(column_name).size().reset_index(name="token count")
    # sort by count, with the highest on top, and renumber the index
    # (drop=True makes sure the old index is NOT added as a column)
    counts = counts.sort_values("token count", ascending=False).reset_index(drop=True)
    # compute the relative frequency of each value.
    # NOTE: sum the "token count" column explicitly; summing all numeric columns and
    # taking the first would pick the wrong total when the grouped property is numeric.
    counts["perc"] = counts["token count"] / counts["token count"].sum()
    return counts
def df_filter(df_column, pattern, method='contains'):
    '''
    Helper function to build some condition to filter a Pandas DataFrame,
    given a column and some value(s) to filter this column with
    Args:
        df_column: a Pandas DataFrame column to filter on
        pattern: string, set or interval list to filter on
        method: "contains", "match", "isin" or "interval"
    Returns:
        a condition (boolean Series usable as a row mask)
    Raises:
        ValueError: if the pattern does not fit the chosen method, or the method is unknown

    >>> words_ending_with_e = df_filter( df_lexicon["wordform"], 'e$' )
    >>> df_lexicon_final_e = df_lexicon[ words_ending_with_e ]
    '''
    if method == "contains":
        if not isinstance(pattern, str):
            raise ValueError("df_filter 'contains' method needs string as pattern.")
        # na=False: rows with missing values never match
        condition = df_column.str.contains(pattern, na=False)
    elif method == "match":
        if not isinstance(pattern, str):
            raise ValueError("df_filter 'match' method needs string as pattern.")
        condition = df_column.str.match(pattern, na=False)
    elif method == "isin":
        if not isinstance(pattern, set):
            raise ValueError("df_filter 'isin' method needs set as pattern.")
        condition = df_column.isin(pattern)
    elif method == "interval":
        if not (isinstance(pattern, list) and len(pattern) == 2):
            raise ValueError("df_filter 'interval' method needs a list consisting of a lower and upper boundary as pattern.")
        val_from, val_to = pattern
        if val_from is None and val_to is None:
            raise ValueError("Lower boundary or upper boundary of interval should be given.")
        col_numeric = df_column.astype('int32')
        # start from an all-True mask and narrow it down with each given boundary.
        # Test against None explicitly: a boundary of 0 is valid and must not be skipped.
        condition = pd.Series(True, index=df_column.index)
        if val_from is not None:
            condition = condition & (col_numeric >= int(val_from))
        if val_to is not None:
            condition = condition & (col_numeric <= int(val_to))
    else:
        raise ValueError("Choose one of 'contains', 'match', 'isin' or 'interval' as method for df_filter.")
    return condition
def join_df(df_arr, join_type=None):
    '''
    This function joins two dataframes (=concat along axis 1)
    Args:
        df_arr: array of Pandas DataFrames
        join_type: {inner, outer (default)}
    Returns:
        a single Pandas DataFrame

    >>> new_df = join_df( [dataframe1, dataframe2] )
    >>> display_df(new_df)
    '''
    # ref: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
    # "outer" is pd.concat's own default join, so passing it explicitly when
    # join_type is None is equivalent to omitting the parameter
    effective_join = "outer" if join_type is None else join_type
    return pd.concat(df_arr, axis=1, join=effective_join, sort=False)
def column_difference(df_column1, df_column2):
    '''
    This function computes differences and similarities between two DataFrame columns
    Args:
        df_column1: a Pandas DataFrame, filtered by one column
        df_column2: a Pandas DataFrame, filtered by one column
    Returns:
        diff_left: set of words only in df_column1
        diff_right: set of words only in df_column2
        intersec: set of words both in df_column1 and df_column2

    >>> diff_left, diff_right, intersec = column_difference(df_corpus1["word 1"], df_corpus2["word 1"])
    >>> display( 'These words are only in DataFrame #1 : ' + ", ".join(diff_left) )
    >>> display( 'These words are only in DataFrame #2 : ' + ", ".join(diff_right) )
    >>> display( 'These words are common to both DataFrame : ' + ", ".join(intersec) )
    '''
    # work on the distinct values of each column
    set_df1 = set(df_column1)
    set_df2 = set(df_column2)
    diff_left = set_df1 - set_df2
    diff_right = set_df2 - set_df1
    intersec = set_df1 & set_df2
    return diff_left, diff_right, intersec
def get_rank_diff(df1, df2, index=None, label1='rank_1', label2='rank_2'):
    '''
    This function compares the rankings of words common to two dataframes, and computes a rank_diff, in such
    a way that one can see which words are very frequent in one set and rare in the other.
    Args:
        df1: a Pandas DataFrame with a token frequency column "token count" (see example)
        df2: a Pandas DataFrame with a token frequency column "token count" (see example)
        index (Optional): name of the column to be used as index (usually: the lemmata column)
        label1 (Optional): output column name for the ranks of the items of df1
        label2 (Optional): output column name for the ranks of the items of df2
    Returns:
        a Pandas DataFrame with lemmata (index), ranks of both input dataframes (label1 and label2)
        and the rank_diff ('rank_diff' column).

    >>> df_frequency_list1 = get_frequency_list(corpus_to_search1)
    >>> df_frequency_list2 = get_frequency_list(corpus_to_search2)
    >>> df_rankdiffs = get_rank_diff(df_frequency_list1, df_frequency_list2)
    '''
    check_valid_df("get_rank_diff", df1)
    check_valid_df("get_rank_diff", df2)
    if index is not None:
        # https://stackoverflow.com/questions/42196337/dataframe-set-index-not-setting
        df1 = df1.set_index(index, drop=True)
        df2 = df2.set_index(index, drop=True)
    # Find lemmata shared by both dataframes: computing rank diffs is only possible
    # when dealing with lemmata which are in both frames
    lemmata_list1 = set(df1.index.tolist())
    lemmata_list2 = set(df2.index.tolist())
    common_lemmata_list = list(lemmata_list1.intersection(lemmata_list2))
    # Build dataframes limited to the common lemmata.
    # .copy() so that the rank columns assigned below are written to an independent
    # frame instead of a view (avoids SettingWithCopyWarning / lost writes)
    limited_df1 = df1.loc[common_lemmata_list, :].copy()
    limited_df2 = df2.loc[common_lemmata_list, :].copy()
    # Recompute ranks in both dataframes, because in each frame the original ranks were
    # computed with a lemmata list which might be larger than the lemmata list common
    # to both dataframes
    limited_df1['rank'] = limited_df1['token count'].rank(ascending=False).astype(int)
    limited_df2['rank'] = limited_df2['token count'].rank(ascending=False).astype(int)
    # Instantiate a dataframe for storing lemmata and rank diffs
    df_rankdiffs = pd.DataFrame(index=common_lemmata_list, columns=[label1, label2, 'rank_diff'])
    df_rankdiffs.index.name = 'lemmata'
    df_rankdiffs[label1] = limited_df1['rank']
    df_rankdiffs[label2] = limited_df2['rank']
    # absolute difference: direction of the shift doesn't matter, only its size
    df_rankdiffs['rank_diff'] = (df_rankdiffs[label1] - df_rankdiffs[label2]).abs()
    return df_rankdiffs
def get_relfreq_diff(df1, df2, index=None, label1='relfreq_1', label2='relfreq_2', operation="division", N=1):
    '''
    This function compares the relative frequencies of words in two dataframes, in such a way
    that one can see which words are very frequent in one set and rare in the other.
    Args:
        df1: a Pandas DataFrame provided with relative frequency stored in a column "perc" (see example)
        df2: a Pandas DataFrame provided with relative frequency stored in a column "perc" (see example)
        index (Optional): name of the column to be used as index (usually: the lemmata column)
        label1 (Optional): output column name for the relative frequency of the items of df1
        label2 (Optional): output column name for the relative frequency of the items of df2
        operation (optional): 'division' for dividing relative frequencies by each other, 'subtraction' for subtracting relative frequencies from each other. Default 'division'
        N (optional): smoothing parameter when operation is 'division'. Default 1.
    Returns:
        a Pandas DataFrame with lemmata (index), the relative frequencies of both input
        dataframes (label1 and label2 columns) and the difference ('relfreq_diff' column).
    Raises:
        ValueError: if operation is neither 'division' nor 'subtraction'

    >>> df_frequency_list1 = get_frequency_list(corpus_to_search1)
    >>> df_frequency_list2 = get_frequency_list(corpus_to_search2)
    >>> df_relfreqdiffs = get_relfreq_diff(df_frequency_list1, df_frequency_list2)
    '''
    check_valid_df("get_relfreq_diff", df1)
    check_valid_df("get_relfreq_diff", df2)
    if index is not None:
        # https://stackoverflow.com/questions/42196337/dataframe-set-index-not-setting
        df1 = df1.set_index(index, drop=True)
        df2 = df2.set_index(index, drop=True)
    # Work on the union of the lemmata of both dataframes: a lemma missing
    # in one frame simply gets a relative frequency of 0 there (see fillna below)
    lemmata_list1 = set(df1.index.tolist())
    lemmata_list2 = set(df2.index.tolist())
    union_lemmata_list = list(lemmata_list1.union(lemmata_list2))
    # Align both dataframes on the union of lemmata (missing rows become NaN)
    limited_df1 = df1.reindex(union_lemmata_list)
    limited_df2 = df2.reindex(union_lemmata_list)
    # Instantiate a dataframe for storing lemmata and relative frequency diffs
    df_relfreq_diffs = pd.DataFrame(index=union_lemmata_list, columns=[label1, label2, 'relfreq_diff'])
    df_relfreq_diffs.index.name = 'lemmata'
    df_relfreq_diffs[label1] = limited_df1['perc']
    df_relfreq_diffs[label2] = limited_df2['perc']
    df_relfreq_diffs = df_relfreq_diffs.fillna(0)
    if operation == "division":
        # additive smoothing with N keeps the division defined when a lemma
        # is missing (relative frequency 0) in the second dataframe
        df_relfreq_diffs['relfreq_diff'] = (df_relfreq_diffs[label1] + N) / (df_relfreq_diffs[label2] + N)
    elif operation == "subtraction":
        df_relfreq_diffs['relfreq_diff'] = df_relfreq_diffs[label1] - df_relfreq_diffs[label2]
    else:
        raise ValueError("Choose one of 'division' or 'subtraction' as operation for get_relfreq_diff.")
    return df_relfreq_diffs