# Source code for chaininglib.search.LexiconQuery

import json
import pandas as pd
import urllib
import requests
import chaininglib.constants as constants
import chaininglib.ui.status as status
import chaininglib.search.lexiconQueries as lexiconQueries

from chaininglib.search.GeneralQuery import GeneralQuery

class LexiconQuery(GeneralQuery):
    """ A query on a lexicon. """

    def __init__(self, resource, lemma=None, pos=None):
        # A lexicon query is a GeneralQuery restricted to lemma/pos filtering:
        # token patterns and word forms are not set at construction time.
        super().__init__(resource, pattern=None, lemma=lemma, word=None, pos=pos)

    def __str__(self):
        # Human-readable summary of the query's resource and filters.
        return f'LexiconQuery({self._resource}, {self._lemma}, {self._pos})'
[docs] def search(self): ''' Perform a lexicon search Returns: LexiconQuery object >>> # build a lexicon search query >>> lexicon_obj = create_lexicon(some_lexicon).lemma(some_lemma).search() >>> # get the results as table of kwic's >>> df = lexicon_obj.kwic() ''' if self._resource not in constants.AVAILABLE_LEXICA: raise ValueError("Unknown lexicon: " + self._resource) if self._pattern_given is not None: raise ValueError('In lexicon search, patterns are not allowed. Use lemma and/or a part-of-speech instead.') if self._lemma is None and self._pos is None and self._word is None and self.lemma_id is None: raise ValueError("A lemma (id) and/or a part-of-speech and/or a word is required" + self._word) # Reset self._df_kwic, from previous calls of search() self._df_kwic = pd.DataFrame() # show wait indicator, so the user knows what's happening status.show_wait_indicator('Searching '+self._resource) lexicon_settings = constants.AVAILABLE_LEXICA[self._resource] method = lexicon_settings["method"] if method=="sparql": endpoint = lexicon_settings["sparql_url"] sparql_offset = 0 sparql_limit = 10000 if self._resource == 'diamant' else None non_empty_response = True collection_df = pd.DataFrame() while non_empty_response: # show how far we are (update offset indicator) status.show_wait_indicator('Querying '+self._resource+' at offset '+ str(sparql_offset)) # build query query = lexiconQueries.lexicon_query(self._lemma, self._pos, self._resource, sparql_limit, sparql_offset) try: # Accept header is needed for virtuoso response = requests.post(endpoint, data={"query":query}, headers = {"Accept":"application/sparql-results+json"}) except Exception as e: status.remove_wait_indicator() raise ValueError("An error occured when searching lexicon " + self._resource + ": "+ str(e)) # if response is ill formed, show it for debugging! 
try: response_json = json.loads(response.text) except: display(response.text) records_json = response_json["results"]["bindings"] records_string = json.dumps(records_json) # _df_kwic is assigned instead of appended, so kwic() can be called multiple times part_df = pd.read_json(records_string, orient="records") # if the response is not empty, append the data to the whole collection of data if part_df.size > 0: collection_df = collection_df.append(part_df) # if the response size equals the max allowed size, we might have to collect more data at the next offset if (sparql_limit is not None and part_df.size == sparql_limit): sparql_offset = sparql_offset + sparql_limit # otherwise, we must have reached the end, so there is no need to query any further else: non_empty_response = False # remove previous offset indicator status.remove_wait_indicator() # _df_kwic is assigned instead of appended, so kwic() can be called multiple times self._df_kwic = collection_df # make sure cells containing NULL are added too, otherwise we'll end up with ill-formed data # CAUSES MALFUNCTION: df = df.fillna('') self._df_kwic = self._df_kwic.applymap(lambda x: '' if pd.isnull(x) else x["value"]) elif method=="lexicon_service": query_url = constants.LEXICON_SERVICE_URL + "&database=" + self._resource query_url = query_url.replace('_QUERY_TYPE_', self._query_type) if 0==1 and self._lemma is None and self._word is None: raise ValueError("For this lexicon, a lemma (id) or a word is necessary!" 
+ self._word) if self._lemma: query_url += "&lemma=" + self._lemma if self._word: query_url += "&wordform=" + self._word if self._pos: query_url += "&pos=" + self._pos if self._lemma_id: query_url += "&lemma_id=" + self._lemma_id try: response = requests.get(query_url, headers = {"Accept":"application/json"}) except Exception as e: status.remove_wait_indicator() raise ValueError("An error occured when searching lexicon " + self._resource + ": "+ str(e)) print ("Query URL: " + query_url) response_json = json.loads(response.text) result_is_lemma = (self._query_type == 'get_lemma_from_wordform') if (result_is_lemma): records_json = response_json["lemmata_list"] elif self._query_type == 'get_related_lemmata': records_json = response_json['found_lemmata_and_relations'] else: records_json = response_json["wordforms_list"] records_string = json.dumps(records_json) # _df_kwic is assigned instead of appended, so kwic() can be called multiple times if (self._query_type == 'get_related_lemmata'): df_query_result = pd.read_json(records_string) self._df_kwic = self._df_kwic.append(df_query_result, ignore_index=True) else: for query_result in records_json: query_result_string = json.dumps(query_result) if (result_is_lemma): lemma_records = query_result['found_lemmata'] lemma_record_string = json.dumps(lemma_records) df_query_result = pd.read_json(lemma_record_string) self._df_kwic = self._df_kwic.append(df_query_result, ignore_index=True) else: df_query_result = pd.read_json(query_result_string) self._df_kwic = self._df_kwic.append(df_query_result, ignore_index=True) self._df_kwic = self._df_kwic.rename(columns={"found_wordforms":"wordform"}).rename(columns={"found_lemmata":"lemma"}) # make sure cells containing NULL are added too, otherwise we'll end up with ill-formed data # CAUSES MALFUNCTION: df = df.fillna('') #self._df_kwic = self._df_kwic.applymap(lambda x: '' if pd.isnull(x) else x["value"]) else: raise ValueError("Unknown lexicon search method: " + method) # remove wait 
indicator, status.remove_wait_indicator() self._search_performed = True # object enriched with response return self._copyWith('_response', records_string)
# OUTPUT
[docs] def json(self): ''' Get the JSON response (unparsed) of a lexicon search Returns: JSON string >>> # build a lexicon search query >>> lexicon_obj = create_lexicon(some_lexicon).lemma(some_lemma).search() >>> # get the JSON response >>> df = lexicon_obj.json() ''' self.check_search_performed() return self._response
[docs] def kwic(self): ''' Get the keyword in context (KWIC) results (as Pandas DataFrame) of a lexicon search Returns: Pandas DataFrame >>> # build a lexicon search query >>> lexicon_obj = create_lexicon(some_lexicon).lemma(some_lemma).search() >>> # get the results as table of kwic's >>> df = lexicon_obj.kwic() ''' self.check_search_performed() return self._df_kwic
def create_lexicon(name):
    '''
    API constructor

    Returns:
        LexiconQuery object

    >>> lexicon_obj = create_lexicon(some_lexicon).lemma(some_lemma).search()
    >>> df = lexicon_obj.kwic()
    '''
    # Entry point of the fluent query API: wraps the lexicon name in a query object.
    query = LexiconQuery(name)
    return query
def get_available_lexica():
    '''
    This function returns the list of the available lexica

    Returns:
        list of lexicon name strings
    '''
    # Iterating the dict directly yields its keys, in insertion order.
    return [lexicon_name for lexicon_name in constants.AVAILABLE_LEXICA]