Source code for chaininglib.search.TreebankQuery

import copy
import requests
import json
import urllib
import chaininglib.constants as constants
from chaininglib.search.treebankParse import _parse_treebank_xml
import chaininglib.ui.status as status
from BaseXClient import BaseXClient
import pandas as pd
import chaininglib.search.treebankQueries as treebankQueries

from chaininglib.search.GeneralQuery import GeneralQuery

class TreebankQuery(GeneralQuery):
    '''
    A query on a treebank.

    The query is either a user supplied pattern, or a pattern generated from
    a lemma / word / pos combination. After `search()` the raw XML response
    can be read with `xml()`, as parsed trees with `trees()`, or as a
    Pandas DataFrame with `kwic()`.
    '''

    def __init__(self, resource):
        super().__init__(resource)

    def __str__(self):
        return 'TreebankQuery({0}, {1}, {2})'.format(
            self._resource, self._pattern_given, self._response)

    def search(self):
        '''
        Perform a treebank search

        The treebank settings (url, access method, credentials) are read from
        constants.AVAILABLE_TREEBANKS, using the resource name as key.

        Returns:
            TreebankQuery object (the result of `_copyWith`, holding the
            XML response in `_response`)

        Raises:
            ValueError: if both a pattern and lemma/word/pos are given, if
                none of them is given, if the configured access method is
                not supported, or if the treebank request fails.
            KeyError: if the resource is missing from
                constants.AVAILABLE_TREEBANKS.

        >>> # build a treebank search query
        >>> treebank_obj = create_treebank(some_treebank).pattern(some_pattern).search()
        '''
        self._pattern = self._resolve_pattern()

        # show wait indicator, so the user knows what's happening
        status.show_wait_indicator('Searching treebanks')
        try:
            treebanks_settings = constants.AVAILABLE_TREEBANKS[self._resource]
            endpoint = treebanks_settings["treebanks_url"]
            method = treebanks_settings["method"]

            if method == "xml":
                response = self._search_basex(endpoint, treebanks_settings)
            elif method == "gretel":
                response = self._search_gretel(endpoint)
            else:
                # Previously this case silently returned None and left the
                # wait indicator on screen.
                raise ValueError(
                    "Unsupported treebank search method : {0!r}".format(method))
        finally:
            # remove wait indicator, whatever the outcome of the search was
            status.remove_wait_indicator()

        self._search_performed = True

        # object enriched with response
        return self._copyWith('_response', response)

    def _resolve_pattern(self):
        '''
        Determine the pattern to send: the user supplied pattern, or else a
        pattern built out of lemma, word and pos.

        Raises:
            ValueError: if the input is redundant or missing.
        '''
        if self._pattern_given:
            if self._lemma or self._word or self._pos:
                raise ValueError('When a pattern (%s) is given, lemma (%s), word (%s) and/or pos (%s) cannot be supplied too. Redundant!' % (self._pattern_given, self._lemma, self._word, self._pos))
            # Use pattern supplied by user
            return copy.copy(self._pattern_given)

        # Pattern will be built with lemma, word, pos
        if self._lemma or self._word or self._pos:
            pattern = treebankQueries.treebank_query(self._lemma, self._word, self._pos)
            print(pattern)
            return pattern

        # If nothing is given: complain
        raise ValueError('A pattern OR a lemma/word/pos is required')

    def _search_basex(self, endpoint, treebanks_settings):
        '''
        Run the pattern on a BaseX server and return its response.

        Raises:
            ValueError: wrapping any error raised while connecting or querying.
        '''
        try:
            # create session
            session = BaseXClient.Session(
                endpoint,
                treebanks_settings["port"],
                treebanks_settings["user"],
                treebanks_settings["pass"])
            try:
                # NOTE(review): the database name is hard-coded, whatever the
                # resource is - confirm this is intended.
                session.execute("open CGN_ID")

                # The command keyword has to precede the expression
                # ("xquery <expr>"); it is only added when not there yet.
                pattern_to_send = self._pattern
                if not pattern_to_send.startswith("xquery"):
                    pattern_to_send = "xquery " + pattern_to_send

                # perform command and return xml response
                return session.execute(pattern_to_send)
            finally:
                # close session, also when the query failed
                session.close()
        except Exception as e:
            raise ValueError("An error occured when searching the treebank : " + str(e)) from e

    def _search_gretel(self, endpoint):
        '''
        Run the pattern on a GrETEL endpoint and return the concatenated
        XML of its results.

        Raises:
            ValueError: wrapping any error raised while reading the treebank
                components, posting the query or reading the response.
        '''
        # first we need to get the components-ids of the treebank we'd like to query
        try:
            components_response = requests.get(endpoint + "/configured_treebanks")
            response_json = json.loads(components_response.text)
            components_data = response_json[self._resource]["components"]

            # gather components names which are NOT disabled
            components_names = [
                comp_key for comp_key in components_data
                if not ("disabled" in components_data[comp_key]
                        and components_data[comp_key]["disabled"] is True)
            ]
        except Exception as e:
            raise ValueError("An error occured when reading the treebank components : " + str(e)) from e

        # send the pattern in a post-request
        try:
            data_arr = {
                "already": None,
                "remainingComponents": components_names,
                "remainingDatabases": None,
                "corpus": self._resource,
                "isAnalysis": False,
                "iteration": 0,
                "needRegularGrinded": False,
                "retrieveContext": False,
                "searchLimit": None,
                "variables": [],
                "xpath": self._pattern,
            }
            headers_arr = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}
            response = requests.post(
                endpoint + "/results",
                data=json.dumps(data_arr),
                headers=headers_arr)

            try:
                if response is None:
                    raise ValueError("The treebanks json response was empty (None)")
                json_response = response.json()
            except Exception as e:
                raise ValueError("An error occured when reading the treebanks json response : " + str(e)) from e

            # now extract the xml content out of the json response,
            # and build the xml response string
            xmllist = json_response['xmllist']
            return ''.join(xmllist[node_key] for node_key in xmllist)
        except Exception as e:
            raise ValueError("An error occured when searching the treebank : " + str(e)) from e

    # OUTPUT

    def xml(self):
        '''
        Get the XML response (unparsed) of a treebank search

        Returns:
            XML string

        >>> treebank_obj = create_treebank(some_treebank).pattern(some_pattern).search()
        >>> xml_response = treebank_obj.xml()
        '''
        self.check_search_performed()
        return self._response

    def kwic(self, align_lemma=None, align_pos=None, align_wordform=None):
        '''
        Get the results (as Pandas DataFrame) of a treebank search, with one keyword in context (KWIC) per row

        Beware: if no align_lemma etc is given, there will be no alignment.
        Each row then holds 'lemma i', 'pos i' and 'wordform i' columns for
        every token i of a tree.

        With alignment, a row consists of 'left context', 'lemma 0', 'pos 0',
        'wordform 0' and 'right context'. Trees in which the alignment
        criteria match no token, or more than one token, are left out.

        Args:
            align_lemma: the lemma to align all results with (it then behaves as a hit)
            align_pos: same with pos
            align_wordform: same with wordform
        Returns:
            Pandas DataFrame

        >>> treebank_obj = create_treebank(some_treebank).pattern(some_pattern).search()
        >>> df = treebank_obj.kwic()
        '''
        self.check_search_performed()

        no_alignment = (align_lemma is None
                        and align_pos is None
                        and align_wordform is None)
        aligned_columns = ['left context', 'lemma 0', 'pos 0', 'wordform 0', 'right context']

        # one single-row frame per tree; concatenated once at the end, as
        # concatenating within the loop copies the growing frame each time
        frames = []
        for one_tree in self.trees():
            # get the layers: one (lemma, pos, wordform) entry per token
            layers = one_tree.toLayers()

            if no_alignment:
                # layers need to get into a 1-dimension array
                row = [value for one in layers for value in one]
                columns_lst = []
                for i in range(len(layers)):
                    columns_lst += ['lemma ' + str(i), 'pos ' + str(i), 'wordform ' + str(i)]
            else:
                row = self._aligned_kwic_row(layers, align_lemma, align_pos, align_wordform)
                if row is None:
                    continue
                columns_lst = aligned_columns

            frames.append(pd.DataFrame([row], columns=columns_lst))

        if frames:
            df = pd.concat(frames, sort=False, ignore_index=True)
        else:
            df = pd.DataFrame()

        # trees of different lengths leave gaps, which we show as empty strings
        df = df.fillna("")

        # _df_kwic is assigned instead of appended, so kwic() can be called multiple times
        self._df_kwic = df
        return self._df_kwic

    @staticmethod
    def _aligned_kwic_row(layers, align_lemma, align_pos, align_wordform):
        '''
        Build one aligned KWIC row out of the layers of a tree: left context,
        followed by the hit in 3 layers (lemma, pos, wordform), and finally
        the right context.

        Returns:
            A list of 5 values, or None if not exactly one token matches
            all the alignment criteria which were given.
        '''
        tokens = [(one[0], one[1], one[2]) for one in layers]

        hit_positions = [
            position for position, (lemma, pos, wordform) in enumerate(tokens)
            if (align_pos is None or align_pos == pos)
            and (align_lemma is None or align_lemma == lemma)
            and (align_wordform is None or align_wordform == wordform)
        ]
        if len(hit_positions) != 1:
            return None

        hit_position = hit_positions[0]
        hit_lemma, hit_pos, hit_wordform = tokens[hit_position]
        left_context = " ".join(token[2] for token in tokens[:hit_position])
        right_context = " ".join(token[2] for token in tokens[hit_position + 1:])
        return [left_context, hit_lemma, hit_pos, hit_wordform, right_context]

    def trees(self):
        '''
        Get results (as nested objects) matching a treebank search query

        Returns:
            list of tree objects

        >>> treebank_obj = create_treebank(some_treebank).pattern(some_pattern).search()
        >>> list_of_trees = treebank_obj.trees()
        '''
        self.check_search_performed()
        return _parse_treebank_xml(self._response)
def create_treebank(name):
    '''
    API constructor: entry point for building a treebank query.

    Args:
        name: Name of the treebank (eg. 'cgn', 'lassy', ...)
    Returns:
        TreebankQuery object

    >>> treebank_obj = create_treebank(some_treebank).pattern(some_pattern).search()
    >>> df = treebank_obj.kwic()
    '''
    treebank_query = TreebankQuery(name)
    return treebank_query
def get_available_treebanks():
    '''
    List the treebanks which can be queried, that is the keys of
    constants.AVAILABLE_TREEBANKS.

    Returns:
        list of treebanks names strings
    '''
    treebank_names = constants.AVAILABLE_TREEBANKS.keys()
    return list(treebank_names)