Source code for chaininglib.process.corpus

from nltk.tag.perceptron import PerceptronTagger
from chaininglib.utils.dfops import property_freq, df_filter
import pandas as pd
import chaininglib.ui.status as status
import re

# Beware: just like chaininglib.utils.dfops, this file contains functions operating on DataFrames.
# However, the functions in this file aim to manipulate DataFrames with corpus data,
# whereas the functions in dfops are more general.
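
# The functions below assume the "wide" column layout produced by corpus searches:
# each row holds one hit/sentence, and each token contributes one column per annotation
# layer, e.g. 'word 0', 'lemma 0', 'pos 0', 'word 1', 'lemma 1', 'pos 1', ...
# A minimal hand-made sketch of such a DataFrame (column names, words and tags are
# invented for illustration only; real search results contain more layers and tokens):
#
# >>> df_example = pd.DataFrame({
# ...     'word 0': ['De', 'Een'],    'lemma 0': ['de', 'een'],    'pos 0': ['ART', 'ART'],
# ...     'word 1': ['boef', 'dief'], 'lemma 1': ['boef', 'dief'], 'pos 1': ['NOU', 'NOU'],
# ... })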

def get_frequency_list(df_corpus, column_name="lemma"):
    '''
    This function computes the raw frequency of lemmata in a DataFrame containing corpus data

    Args:
        df_corpus: a Pandas DataFrame with corpus data (it must contain at least one 'lemma' column)
        column_name: the column name (default 'lemma') containing the items of which we are computing frequencies
    Returns:
        a Pandas DataFrame with the lemmata as index, 'token count' as the number of occurrences
        per lemma, and 'rank' as the ordinal position in the list of lemmata, based on the 'token count'.

    >>> df_corpus = create_corpus("gysseling").lemma("boef").search().kwic()
    >>> df_freq_list = get_frequency_list(df_corpus)
    '''
    status.show_wait_indicator('Building frequency list')

    # get a list of the columns named 'lemma...'
    all_col_names = list(df_corpus.columns.values)
    lemma_col_names = [x for x in set(all_col_names) if str(x).startswith(column_name)]
    if len(lemma_col_names) == 0:
        raise ValueError(
            "function get_frequency_list() was called with a DataFrame which doesn't contain any '%s' column. "
            "If needed, rename the relevant column of your DataFrame to '%s'." % (column_name, column_name))

    # instantiate a DataFrame with one single column (e.g. 'lemmas'),
    # in which we will gather all single lemma occurrences
    df_lemmata_list = pd.DataFrame()
    column_name_plural = column_name + "s"

    # loop through the list of lemma columns:
    # for each of them, gather all lemma occurrences and append them to df_lemmata_list
    for col_name in lemma_col_names:
        # rename the column in question to the plural form, so as to be able to merge
        # this column with the full list of lemmata gathered so far
        sub_df_corpus = df_corpus[col_name].rename(column_name_plural)
        df_lemmata_list = pd.concat([df_lemmata_list, sub_df_corpus])
    df_lemmata_list.columns = [column_name_plural]

    # use property_freq to compute a frequency list
    df_frequency_list = property_freq(df_lemmata_list, column_name_plural)

    # set the lemmata column to be the index
    # (set_index returns a new DataFrame, so the result must be assigned back)
    df_frequency_list = df_frequency_list.set_index(column_name_plural)

    # final step: compute ranks
    # this is needed to be able to compare different frequency lists
    # with each other (which we could achieve by computing a rank diff)
    df_frequency_list['rank'] = df_frequency_list['token count'].rank(ascending=False).astype(int)

    status.remove_wait_indicator()
    return df_frequency_list
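
# Illustrative sketch (not part of the library API): get_frequency_list() can also be applied
# to a hand-made DataFrame, as long as it has at least one column whose name starts with
# column_name. The column names and values below are assumptions made purely for the example;
# the exact shape of the result depends on property_freq() from chaininglib.utils.dfops.
#
# >>> df_toy = pd.DataFrame({'lemma 0': ['boef', 'dief'], 'lemma 1': ['boef', 'boef']})
# >>> df_freq = get_frequency_list(df_toy)
# >>> # 'boef' occurs three times and should receive rank 1, 'dief' occurs once and should receive rank 2
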
def extract_lexicon(dfs_corpus, lemmaColumnName='lemma', posColumnName='pos', wordformColumnName='word'):
    '''
    This function creates a lexicon from a list of corpus search results. The lemma, POS and
    word column names of the corpus results are also used for the resulting lexicon.

    Args:
        dfs_corpus: list of Pandas DataFrames with search results from different corpora
        lemmaColumnName: (default 'lemma') column name for lemma in dfs_corpus
        posColumnName: (default 'pos') column name for part-of-speech in dfs_corpus
        wordformColumnName: (default 'word') column name for word form in dfs_corpus
    Returns:
        a Pandas DataFrame representing a lexicon, with lemmaColumnName, posColumnName
        and wordformColumnName as columns

    >>> dfs_corpus = [df_results_corpus1, df_results_corpus2]
    >>> lexicon = extract_lexicon(dfs_corpus, lemmaColumnName='lemma', posColumnName='pos', wordformColumnName='word')
    '''
    print("Extracting lexicon...")

    # instantiate a DataFrame in which we will gather the paradigms
    df_lexicon = pd.DataFrame()

    # the algorithm expects a list of DataFrames, so make sure we have just that
    if isinstance(dfs_corpus, pd.DataFrame):
        dfs_corpus = [dfs_corpus]

    for df_corpus in dfs_corpus:
        # extract the basic layers (lemma, pos, wordform) contained in df_corpus
        column_names = list(df_corpus.columns.values)
        lemma_columns = list(filter(lambda name: name.startswith(lemmaColumnName + ' '), column_names))
        pos_columns = list(filter(lambda name: name.startswith(posColumnName + ' '), column_names))
        wordform_columns = list(filter(lambda name: name.startswith(wordformColumnName + ' '), column_names))

        # to be able to extract a lexicon, we need at least: lemma, pos, wordform
        # (lemma and wordform alone is dangerous, since there can be homonyms with different
        # grammatical categories, so when grouping them we would end up with mixed-up paradigms)
        if not (len(lemma_columns) > 0 and len(lemma_columns) == len(pos_columns) == len(wordform_columns)):
            print("Skipping corpus. extract_lexicon() expects the Pandas DataFrame input to contain at least these columns: "
                  + lemmaColumnName + ", " + posColumnName + " and " + wordformColumnName)
            continue

        # loop through the layers, extract those as a temporary DataFrame,
        # and concat each temporary DataFrame with the main DataFrame to get a full list
        for i in range(0, len(lemma_columns)):
            current_lemma = lemma_columns[i]
            current_pos = pos_columns[i]
            current_wordform = wordform_columns[i]
            sub_df_corpus = df_corpus.loc[:, [current_lemma, current_pos, current_wordform]]
            sub_df_corpus.columns = [lemmaColumnName, posColumnName, wordformColumnName]
            df_lexicon = pd.concat([df_lexicon, sub_df_corpus])

    # set column names
    df_lexicon.columns = [lemmaColumnName, posColumnName, wordformColumnName]

    # get rid of ill-formed lemmata and word forms, and lowercase everything
    df_lexicon = df_lexicon[df_lexicon[lemmaColumnName].apply(lambda x: type(x) == str)]
    df_lexicon[lemmaColumnName] = df_lexicon[lemmaColumnName].apply(lambda x: x.lower())
    df_lexicon = df_lexicon[df_lexicon[wordformColumnName].apply(lambda x: type(x) == str)]
    df_lexicon[wordformColumnName] = df_lexicon[wordformColumnName].apply(lambda x: x.lower())
    df_lexicon = df_lexicon[df_lexicon[lemmaColumnName].str.contains("^[a-z]+$")]

    # make sure each lemma-pos-wordform combination is unique
    df_lexicon = df_lexicon.drop_duplicates()
    df_lexicon = df_lexicon.sort_values(by=[lemmaColumnName, posColumnName])
    df_lexicon = df_lexicon.reset_index(drop=True)

    return df_lexicon
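
# Illustrative sketch (not part of the library API): extracting a small lexicon from a
# hand-made corpus DataFrame. The column names, words and tags below are assumptions made
# purely for the example; real corpus results provide the same column layout automatically.
#
# >>> df_toy = pd.DataFrame({
# ...     'word 0': ['boeven', 'dieven'], 'lemma 0': ['boef', 'dief'], 'pos 0': ['NOU', 'NOU'],
# ... })
# >>> df_lex = extract_lexicon([df_toy])
# >>> # expected result: one row per unique lemma-pos-wordform combination,
# >>> # e.g. ('boef', 'NOU', 'boeven') and ('dief', 'NOU', 'dieven')
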
def get_tagger(dfs_corpus, word_key="word", pos_key="universal_dependency"):
    '''
    This function instantiates a tagger trained on corpus annotations (taken from one or more DataFrames)

    Args:
        dfs_corpus: one (or a list of) Pandas DataFrame(s) with annotated corpus data
        word_key: (default 'word') column name for wordforms in dfs_corpus
        pos_key: (default 'universal_dependency') column name for parts-of-speech in dfs_corpus
    Returns:
        a PerceptronTagger instance

    >>> # get a tagger, trained with df_corpus: a Pandas DataFrame with lots of corpus data
    >>> tagger = get_tagger(df_corpus)
    >>> # tag a sentence now
    >>> sentence = 'Here is some beautiful sentence'
    >>> tagged_sentence = tagger.tag( sentence.split() )
    >>> print(tagged_sentence)
    '''
    sentences = []

    # the algorithm expects a list of DataFrames, so make sure we have just that
    if isinstance(dfs_corpus, pd.DataFrame):
        dfs_corpus = [dfs_corpus]

    for df_corpus in dfs_corpus:
        # The corpus DataFrame consists of a number of sentences (rows) with a fixed number of tokens.
        # Each token has a fixed number of layers holding info like: lemma, wordform or part-of-speech.
        # As a result, the number of columns of each row = [number of tokens] x [number of layers].
        # To be able to feed the tagger correctly, we need to compute the number of layers,
        # so we can infer the number of tokens the sentences hold. This is because
        # the tagger expects to be fed arrays of length [number of tokens], as elements of
        # one single array holding all sentence arrays (see below).

        # So, determine how many layers (lemma, pos, wordform) we have
        column_names = list(df_corpus.columns.values)
        for n, val in enumerate(column_names):
            # remove the numbers at the end of the layer names (lemma 1, lemma 2, ..., pos 1, pos 2, ...)
            # so we end up with clean layer names only
            column_names[n] = val.split(' ')[0]
        number_of_layers = len(set(column_names))

        # Now we can determine the standard length of our corpus sentences: that can be computed
        # by dividing the number of columns of the corpus DataFrame by the number of layers
        # we just computed.
        nr_of_words_per_sentence = int(df_corpus.shape[1] / number_of_layers)

        # Build training data for the tagger in the right format.
        # The input must be like: [ [('today','NN'), ('is','VBZ'), ('good','JJ'), ('day','NN')], [...] ]
        for index, row in df_corpus.iterrows():
            one_sentence = []
            wrong = False
            for i in range(0, nr_of_words_per_sentence, 1):
                word_idx = word_key + ' ' + str(i)
                pos_idx = pos_key + ' ' + str(i)
                try:
                    if row[word_idx] is None or row[pos_idx] is None:
                        # incomplete token: mark the sentence so it gets skipped below
                        wrong = True
                    else:
                        word_tag = (row[word_idx], _cut_off_features(row[pos_idx]))
                        one_sentence.append(word_tag)
                except KeyError:
                    raise ValueError(
                        "function get_tagger() expects corpus data with columns '%s' and '%s', but those columns "
                        "could not be found. Please call the function with these extra parameters to declare "
                        "which columns your corpus data has instead: "
                        "get_tagger(word_key='...', pos_key='...')." % (word_key, pos_key))
            if wrong is False:
                sentences.append(one_sentence)

    # Instantiate and train the tagger now
    tagger = PerceptronTagger(load=False)
    tagger.train(sentences)
    return tagger
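
# Illustrative sketch (not part of the library API): training a tagger on a tiny hand-made
# DataFrame. The column names follow the '<layer> <token index>' convention assumed above;
# the words and tags are invented for the example, and a real training set should of course
# be much larger before the tagger produces useful output.
#
# >>> df_toy = pd.DataFrame({
# ...     'word 0': ['een', 'de'],    'universal_dependency 0': ['DET', 'DET'],
# ...     'word 1': ['boef', 'dief'], 'universal_dependency 1': ['NOUN', 'NOUN'],
# ... })
# >>> toy_tagger = get_tagger(df_toy)
# >>> print(toy_tagger.tag('de boef'.split()))
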
def _cut_off_features(pos_with_features):
    '''
    This function cuts off features from tags with features attached
    '''
    return re.sub(r'^([A-Z-]+)(|\(.+\))$', r'\1', pos_with_features)
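
# Illustrative examples of what _cut_off_features() does (the tag strings are invented;
# the actual tag set depends on the corpus):
#
# >>> _cut_off_features('NOU-C(gender=n,number=sg)')
# 'NOU-C'
# >>> _cut_off_features('VRB')
# 'VRB'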