import urllib
import requests
import copy
import chaininglib.constants as constants
import chaininglib.ui.status as status
import chaininglib.search.corpusHelpers as corpusHelpers
import chaininglib.search.corpusQueries as corpusQueries
import pandas as pd
import sys
from chaininglib.search.GeneralQuery import GeneralQuery
class CorpusQuery(GeneralQuery):
    """A query on a token-based corpus.

    The query is configured through the constructor or through the chainable
    setter methods (each of which returns the object produced by
    ``_copyWith``), executed with :meth:`search`, and its results are read
    with :meth:`kwic` or :meth:`xml`.
    """

    def __init__(self, resource, pattern=None, lemma=None, word=None, pos=None,
                 detailed_context=False, extra_fields_doc=None,
                 extra_fields_token=None, start_position=0,
                 max_results=sys.maxsize, metadata_filter=None, method=None):
        '''
        Create a corpus query.
        Args:
            resource: corpus name; must be a key of constants.AVAILABLE_CORPORA
            pattern: complete query pattern (mutually exclusive with lemma/word/pos)
            lemma: lemma to search for
            word: word to search for
            pos: part-of-speech to search for
            detailed_context: forwarded to the response parser, see detailed_context()
            extra_fields_doc: list of extra fields forwarded to the response parser (default: empty list)
            extra_fields_token: list of extra fields forwarded to the response parser (default: empty list)
            start_position: position of the first result to request
            max_results: maximum number of results to retrieve
            metadata_filter: dict keyed by metadata field name (default: empty dict)
            method: "fcs" or "blacklab"; if None, the corpus' "default_method"
                from the configuration is used, or "fcs" when none is configured
        Raises:
            ValueError: if resource is not a known corpus
        '''
        super().__init__(resource, pattern, lemma, word, pos)
        self._detailed_context = detailed_context
        # None sentinels instead of mutable default arguments: every query gets
        # its own fresh containers instead of sharing one module-wide object.
        self._extra_fields_doc = [] if extra_fields_doc is None else extra_fields_doc
        self._extra_fields_token = [] if extra_fields_token is None else extra_fields_token
        self._start_position = start_position
        self._max_results = max_results
        # Kept in sync with start position and max results by the setters below.
        self._maximum_result_number = self._start_position + self._max_results
        self._metadata_filter = {} if metadata_filter is None else metadata_filter
        self._response = []
        self._df_kwic = pd.DataFrame()
        self._search_performed = False
        if self._resource not in constants.AVAILABLE_CORPORA:
            raise ValueError("Unknown corpus: " + self._resource)
        corpus_config = constants.AVAILABLE_CORPORA[self._resource]
        if method is not None:
            # If method supplied by user, use it
            self._method = method
        elif "default_method" in corpus_config:
            # Otherwise, use default method given in config
            self._method = corpus_config["default_method"]
        else:
            # Last resort: try FCS
            self._method = "fcs"

    def __str__(self):
        # Eleven placeholders for eleven values: the request method used to be
        # passed to format() without a matching placeholder and was dropped.
        return 'CorpusQuery({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10})'.format(
            self._resource, self._pattern_given, self._lemma, self._word,
            self._pos, self._detailed_context, self._extra_fields_doc,
            self._extra_fields_token, self._start_position,
            self._metadata_filter, self._method)

    def detailed_context(self, detailed_context=True):
        '''
        Request a CorpusQuery object to return a detailed context.
        Args:
            detailed_context: If True, every single token will be returned with multiple information layers (like lemma, wordform, part-of-speech, ...). If False, only hits will have multiple information layers
        Returns:
            CorpusQuery object
        '''
        return self._copyWith('_detailed_context', detailed_context)

    def start_position(self, start_position):
        '''
        Request a CorpusQuery object to start retrieving at the stated result position.
        This option might not be used by users, but it allows retrieving a
        later slice of a result set that is spread among several pages.
        Args:
            start_position: position of the first result to be requested.
        Returns:
            CorpusQuery object
        '''
        s1 = self._copyWith('_start_position', start_position)
        s1._maximum_result_number = s1._start_position + s1._max_results
        return s1

    def max_results(self, max_results):
        '''
        Limit the maximum number of results returned.
        Args:
            max_results: maximum number of results.
        Returns:
            CorpusQuery object
        '''
        s1 = self._copyWith('_max_results', max_results)
        s1._maximum_result_number = s1._start_position + s1._max_results
        return s1

    def method(self, method):
        '''
        Set method to make request
        Args:
            method: fcs (Federated Content Search) or blacklab
        Returns:
            CorpusQuery object
        '''
        return self._copyWith('_method', method)

    def _resolve_pattern(self):
        '''Return the pattern to send: the user's pattern, or one built from lemma/word/pos.'''
        has_parts = self._lemma or self._word or self._pos
        if self._pattern_given:
            if has_parts:
                raise ValueError('When a pattern (%s) is given, lemma (%s), word (%s) and/or pos (%s) cannot be supplied too. Redundant!' % (self._pattern_given, self._lemma, self._word, self._pos))
            # Use pattern supplied by user
            return copy.copy(self._pattern_given)
        if has_parts:
            # Pattern is built with lemma, word, pos
            return corpusQueries.corpus_query(self._lemma, self._word, self._pos)
        # If nothing is given: complain
        raise ValueError('A pattern OR a lemma/word/pos is required')

    def _build_request_url(self, first_result, amount_to_fetch):
        '''Return the request URL for one page of results for the configured method.'''
        # Imported here because the module only does ``import urllib``, which
        # does not guarantee that the ``urllib.parse`` submodule is loaded.
        from urllib.parse import quote, quote_plus

        if self._method == "fcs":
            # Do request to federated content search corpora, so we get same output format for every corpus
            return (constants.FCS_URL +
                    "&maximumRecords=" + str(amount_to_fetch) +
                    "&startRecord=" + str(first_result) +
                    "&x-fcs-context=" + self._resource +
                    "&query=" + quote_plus(self._pattern))
        if self._method == "blacklab":
            corpus_config = constants.AVAILABLE_CORPORA[self._resource]
            if "blacklab_url" not in corpus_config:
                raise ValueError("Blacklab access not available for this corpus.")
            # Blacklab can filter metadata on server
            lucene_filter = corpusHelpers._create_lucene_metadata_filter(self._metadata_filter)
            return (corpus_config["blacklab_url"] + "/hits?"
                    "&number=" + str(amount_to_fetch) +
                    "&first=" + str(first_result) +
                    "&patt=" + quote(self._pattern) +
                    "&filter=" + quote_plus(lucene_filter))
        raise ValueError("Invalid request method: " + str(self._method) + ". Should be one of: 'fcs' or 'blacklab'.")

    def _parse_response_page(self, response_text):
        '''Parse one response page; return the pair given by the parser (DataFrame, next page start).'''
        if self._method == "fcs":
            parse = corpusHelpers._parse_xml_fcs
        else:
            parse = corpusHelpers._parse_xml_blacklab
        return parse(response_text, self._detailed_context, self._extra_fields_doc, self._extra_fields_token)

    def search(self):
        '''
        Request results matching a corpus search query
        Returns:
            CorpusQuery object
        Raises:
            ValueError: if the pattern/lemma/word/pos combination is invalid, or
                if anything goes wrong while requesting or parsing the results
        >>> # build a corpus search query
        >>> corpus_obj = create_corpus(some_corpus).pattern(some_pattern)
        >>> # get the results
        >>> df = corpus_obj.search().kwic()
        '''
        # _pattern_given keeps unchanged so as to be able to call the same corpus object multiple times
        # only _pattern is set differently if needed
        self._pattern = self._resolve_pattern()

        # The paging cursor is a local variable, so the configured start
        # position survives the search and the same object can be searched again.
        position = self._start_position
        # FCS starts counting at 1. Adjust 0 (default start position) to 1.
        # Other start positions, which are probably given deliberately, are left as is.
        if self._method == "fcs" and position == 0:
            position = 1
        # Exclusive end position, derived from the (possibly shifted) start so
        # that exactly max_results results are requested.
        end_position = position + self._max_results

        # Start from a fresh list, so that xml() only returns this search's pages.
        self._response = []
        pages = []
        try:
            if self._method == "fcs":
                # FCS does filtering on query results, so we have to request the filter fields in our query.
                # dict.fromkeys removes duplicates while keeping a stable order.
                self._extra_fields_doc = list(dict.fromkeys(
                    list(self._extra_fields_doc) + list(self._metadata_filter.keys())))

            # Pages are fetched in a loop rather than recursively, so the number
            # of pages is not bounded by the interpreter's recursion limit.
            while True:
                # show wait indicator
                status.remove_wait_indicator()
                status.show_wait_indicator('Searching '+self._resource+ ' at result '+str(position))
                amount_to_fetch = min(constants.RECORDS_PER_PAGE, max(0, end_position - position))
                url = self._build_request_url(position, amount_to_fetch)
                response_text = requests.get(url).text
                self._response.append(response_text)
                page_df, next_page = self._parse_response_page(response_text)
                pages.append(page_df)
                retrieved_so_far = position + len(page_df.index)
                # Continue only if there is a next page, it actually advances
                # the cursor, and the requested range is not exhausted yet.
                if not (next_page > 0 and next_page > position and retrieved_so_far < end_position):
                    break
                position = next_page
            status.remove_wait_indicator()

            # show message out of xml, if some error has occured (prevents empty output)
            for page_text in self._response:
                corpusHelpers._show_error_if_any(page_text)

            # DataFrame.append was removed in pandas 2.0; concat is the replacement.
            if len(pages) == 1:
                df = pages[0]
            else:
                df = pd.concat(pages, ignore_index=True)

            # Filter results on metadata (performed after query for FCS)
            if self._method == "fcs" and self._metadata_filter:
                filters = corpusHelpers._create_pandas_metadata_filter(df, self._metadata_filter)
                df = df[filters]
            df = df.fillna("")
            # Convert column dtypes
            for year_column in ("witnessYear_from", "witnessYear_to"):
                if year_column in df:
                    df = df.astype({year_column: "int32"})
            self._search_performed = True
            # Save dataframe in object, so it can be retrieved with .kwic()
            return self._copyWith('_df_kwic', df)
        except Exception as e:
            status.remove_wait_indicator()
            # Chain the original exception so its traceback is not lost.
            raise ValueError("An error occured when searching corpus " + self._resource + ": "+ str(e)) from e

    # OUTPUT

    def xml(self):
        '''
        Get the XML response (unparsed) of a Corpus search
        Returns:
            XML string (the responses of all retrieved pages, joined by newlines)
        Raises:
            ValueError: if method is FCS and a metadata filter is set
        >>> corpus_obj = create_corpus(some_corpus).pattern(some_pattern)
        >>> xml = corpus_obj.search().xml()
        '''
        self.check_search_performed()
        if self._method == "fcs" and self._metadata_filter:
            raise ValueError("Retrieving xml not possible for method FCS in combination with metadata filters. Remove metadata filter and try again.")
        return "\n".join(self._response)

    def kwic(self):
        '''
        Get the Pandas DataFrame with one keyword in context (KWIC) per row
        Returns:
            Pandas DataFrame
        '''
        self.check_search_performed()
        return self._df_kwic
def create_corpus(name):
    '''
    API constructor: build a fresh query object for the named corpus.
    Args:
        name: corpus name
    Returns:
        CorpusQuery object
    >>> corpus_obj = create_corpus(some_corpus).pattern(some_pattern)
    >>> df = corpus_obj.search().kwic()
    '''
    corpus_query = CorpusQuery(name)
    return corpus_query
def get_available_corpora(exclude=[]):
    '''
    Return the names of the configured corpora, minus the excluded ones.
    Args:
        exclude: corpus names to leave out of the result (only read, never modified)
    Returns:
        list of corpus name strings
    >>> # get list of corpora at our disposal and query each of them
    >>> for one_corpus in get_available_corpora(exclude=["nederlab"]):
    >>>     c = create_corpus(one_corpus).lemma("woordenboek").detailed_context(True).search()
    >>>     df_corpus = c.kwic()
    '''
    selected = []
    for corpus_name in constants.AVAILABLE_CORPORA.keys():
        if corpus_name in exclude:
            continue
        selected.append(corpus_name)
    return selected