Source code for chaininglib.search.LexiconQuery

import json
import pandas as pd
import urllib
import requests
import chaininglib.constants as constants
import chaininglib.ui.status as status
import chaininglib.search.lexiconQueries as lexiconQueries

from chaininglib.search.GeneralQuery import GeneralQuery

class LexiconQuery(GeneralQuery):
    """ A query on a lexicon.

    The lexicon is looked up in ``constants.AVAILABLE_LEXICA``; its ``method``
    setting decides whether the search goes to a SPARQL endpoint ("sparql")
    or to the lexicon service ("lexicon_service").
    """

    def __init__(self, resource, lemma=None, pos=None):
        # Patterns and word forms are not part of the constructor API of a
        # lexicon query, so they are passed to the parent as None.
        super().__init__(resource, pattern=None, lemma=lemma, word=None, pos=pos)

    def __str__(self):
        return 'LexiconQuery({0}, {1}, {2})'.format(
            self._resource, self._lemma, self._pos)

    def search(self):
        '''
        Perform a lexicon search

        The parsed results are stored in ``self._df_kwic`` (see kwic()) and the
        raw JSON records are attached as ``_response`` (see json()).

        Returns:
            LexiconQuery object (the result of ``_copyWith('_response', ...)``)

        Raises:
            ValueError: if the lexicon is unknown, a pattern was given, no
                search criterion (lemma, lemma id, part-of-speech or word)
                was given, the request failed, the response is not valid
                JSON, or the configured search method is unknown.

        >>> # build a lexicon search query
        >>> lexicon_obj = create_lexicon(some_lexicon).lemma(some_lemma).search()
        >>> # get the results as table of kwic's
        >>> df = lexicon_obj.kwic()
        '''
        if self._resource not in constants.AVAILABLE_LEXICA:
            raise ValueError("Unknown lexicon: " + self._resource)

        if self._pattern_given is not None:
            raise ValueError('In lexicon search, patterns are not allowed. Use lemma and/or a part-of-speech instead.')

        # The attribute is _lemma_id (as used when building the service URL).
        # No value is concatenated to the message: every candidate is None here.
        if (self._lemma is None and self._pos is None
                and self._word is None and self._lemma_id is None):
            raise ValueError("A lemma (id) and/or a part-of-speech and/or a word is required")

        # Reset self._df_kwic, from previous calls of search()
        self._df_kwic = pd.DataFrame()
        # show wait indicator, so the user knows what's happening
        status.show_wait_indicator('Searching '+self._resource)

        lexicon_settings = constants.AVAILABLE_LEXICA[self._resource]
        method = lexicon_settings["method"]

        if method == "sparql":
            records_string = self._search_sparql(lexicon_settings["sparql_url"])
        elif method == "lexicon_service":
            records_string = self._search_lexicon_service()
        else:
            status.remove_wait_indicator()
            raise ValueError("Unknown lexicon search method: " + method)

        # remove wait indicator
        status.remove_wait_indicator()

        self._search_performed = True

        # object enriched with response
        return self._copyWith('_response', records_string)

    def _search_sparql(self, endpoint):
        '''
        Query a SPARQL endpoint, page by page where a limit applies.

        Fills ``self._df_kwic`` with the value of every binding (empty string
        for unbound variables) and returns the bindings of all pages as one
        JSON string.

        Raises:
            ValueError: if the request fails or the response is not JSON.
        '''
        from io import StringIO

        # Only 'diamant' is queried with a page size; other lexica get no limit.
        sparql_limit = 10000 if self._resource == 'diamant' else None
        sparql_offset = 0
        all_records = []

        while True:
            # show how far we are (update offset indicator)
            status.show_wait_indicator('Querying '+self._resource+' at offset '+ str(sparql_offset))

            query = lexiconQueries.lexicon_query(
                self._lemma, self._pos, self._resource, sparql_limit, sparql_offset)

            try:
                # Accept header is needed for virtuoso
                response = requests.post(
                    endpoint,
                    data={"query": query},
                    headers={"Accept": "application/sparql-results+json"})
            except Exception as e:
                status.remove_wait_indicator()
                raise ValueError("An error occured when searching lexicon " + self._resource + ": "+ str(e)) from e

            # If the response is ill-formed, report its text instead of
            # continuing with an unbound variable.
            try:
                response_json = json.loads(response.text)
            except ValueError as e:
                status.remove_wait_indicator()
                raise ValueError(
                    "Ill-formed response when searching lexicon " + self._resource
                    + ": " + response.text) from e

            page_records = response_json["results"]["bindings"]
            all_records.extend(page_records)

            # remove previous offset indicator
            status.remove_wait_indicator()

            # A full page (row count == limit) means there may be more rows at
            # the next offset; anything else means the end was reached.
            # The row count is compared, not DataFrame.size (rows x columns).
            if sparql_limit is None or len(page_records) != sparql_limit:
                break
            sparql_offset += sparql_limit

        records_string = json.dumps(all_records)

        if all_records:
            collection_df = pd.read_json(StringIO(records_string), orient="records")
        else:
            collection_df = pd.DataFrame()

        def binding_value(cell):
            # Unbound variables show up as null cells; keep them as empty
            # strings so the table stays well-formed. (fillna('') is not used:
            # an earlier note in this code reports it caused a malfunction.)
            return '' if pd.isnull(cell) else cell["value"]

        # DataFrame.map replaces the deprecated applymap in newer pandas; the
        # class (not the instance) is tested so a column named "map" cannot
        # be mistaken for the method.
        if hasattr(pd.DataFrame, "map"):
            self._df_kwic = collection_df.map(binding_value)
        else:
            self._df_kwic = collection_df.applymap(binding_value)

        return records_string

    def _search_lexicon_service(self):
        '''
        Query the lexicon service and fill ``self._df_kwic`` with the result.

        Returns the JSON string of the records selected from the response
        (which list is used depends on ``self._query_type``).

        Raises:
            ValueError: if the request fails or the response is not JSON.
        '''
        from io import StringIO
        from urllib.parse import quote

        query_url = constants.LEXICON_SERVICE_URL + "&database=" + self._resource
        query_url = query_url.replace('_QUERY_TYPE_', self._query_type)

        # Values are percent-encoded so characters such as '&', '#' or '+'
        # inside a lemma cannot corrupt the query string.
        criteria = (
            ("lemma", self._lemma),
            ("wordform", self._word),
            ("pos", self._pos),
            ("lemma_id", self._lemma_id),
        )
        for param, value in criteria:
            if value:
                query_url += "&" + param + "=" + quote(str(value), safe="")

        try:
            response = requests.get(query_url, headers={"Accept": "application/json"})
        except Exception as e:
            status.remove_wait_indicator()
            raise ValueError("An error occured when searching lexicon " + self._resource + ": "+ str(e)) from e

        print("Query URL: " + query_url)

        try:
            response_json = json.loads(response.text)
        except ValueError as e:
            status.remove_wait_indicator()
            raise ValueError(
                "Ill-formed response when searching lexicon " + self._resource
                + ": " + response.text) from e

        query_type = self._query_type
        result_is_lemma = (query_type == 'get_lemma_from_wordform')

        if result_is_lemma:
            records_json = response_json["lemmata_list"]
        elif query_type == 'get_related_lemmata':
            records_json = response_json['found_lemmata_and_relations']
        else:
            records_json = response_json["wordforms_list"]
        records_string = json.dumps(records_json)

        # One frame per result entry; they are concatenated in one go below.
        if query_type == 'get_related_lemmata':
            frames = [pd.read_json(StringIO(records_string))]
        elif result_is_lemma:
            frames = [
                pd.read_json(StringIO(json.dumps(query_result['found_lemmata'])))
                for query_result in records_json
            ]
        else:
            frames = [
                pd.read_json(StringIO(json.dumps(query_result)))
                for query_result in records_json
            ]

        # _df_kwic is assigned instead of appended, so kwic() can be called multiple times
        if frames:
            result_df = pd.concat(frames, ignore_index=True)
        else:
            result_df = pd.DataFrame()

        self._df_kwic = result_df.rename(
            columns={"found_wordforms": "wordform", "found_lemmata": "lemma"})

        return records_string

    # OUTPUT

    def json(self):
        '''
        Get the JSON response (unparsed) of a lexicon search

        Returns:
            JSON string

        >>> # build a lexicon search query
        >>> lexicon_obj = create_lexicon(some_lexicon).lemma(some_lemma).search()
        >>> # get the JSON response
        >>> df = lexicon_obj.json()
        '''
        self.check_search_performed()

        return self._response

    def kwic(self):
        '''
        Get the keyword in context (KWIC) results (as Pandas DataFrame) of a lexicon search

        Returns:
            Pandas DataFrame

        >>> # build a lexicon search query
        >>> lexicon_obj = create_lexicon(some_lexicon).lemma(some_lemma).search()
        >>> # get the results as table of kwic's
        >>> df = lexicon_obj.kwic()
        '''
        self.check_search_performed()
        return self._df_kwic
    
    

def create_lexicon(name):
    '''
    API constructor

    Args:
        name: name of the lexicon; it is passed to LexiconQuery as its resource

    Returns:
        LexiconQuery object

    >>> lexicon_obj = create_lexicon(some_lexicon).lemma(some_lemma).search()
    >>> df = lexicon_obj.kwic()
    '''
    return LexiconQuery(name)


def get_available_lexica():
    '''
    This function returns the list of the available lexica

    Returns:
        list of lexicon name strings (the keys of constants.AVAILABLE_LEXICA)
    '''
    # Iterating a dict yields its keys, so .keys() is not needed.
    return list(constants.AVAILABLE_LEXICA)