Source code for chaininglib.search.metadata

import xml.etree.ElementTree as ET
import chaininglib.constants as constants
import re
import requests

from chaininglib.search.lexiconQueries import lexicon_query


def get_available_metadata(resource_name, resource_type=None):
    '''
    Return all possible metadata fields for a lexicon or corpus

    Args:
        resource_name: Name of the lexicon or corpus
        resource_type: (optional) One of 'lexicon' or 'corpus'. Can be used to
            disambiguate when the resource name can be both a lexicon and a corpus
    Returns:
        A dictionary of lists of document and token metadata (corpus),
        or a list of metadata fields (lexicon)

    >>> corpus_metadata = get_available_metadata("zeebrieven")
    >>> print(corpus_metadata)
    {'document': ['aantal_paginas', 'aantal_woorden', ..., 'witnessYear_from', 'witnessYear_to'], 'token': ['word', 'lemma', 'pos', 'punct', 'starttag']}
    >>> lexicon_metadata = get_available_metadata("molex")
    >>> print(lexicon_metadata)
    ['lemEntryId', 'lemma', 'lemPos', 'wordformId', 'wordform', 'hyphenation', 'wordformPos', 'Gender', 'Number']
    '''
    # Infer the resource type from the name
    if resource_name in constants.AVAILABLE_CORPORA and resource_name not in constants.AVAILABLE_LEXICA:
        res_type = "corpus"
    elif resource_name in constants.AVAILABLE_LEXICA and resource_name not in constants.AVAILABLE_CORPORA:
        res_type = "lexicon"
    elif resource_name in constants.AVAILABLE_LEXICA and resource_name in constants.AVAILABLE_CORPORA:
        if resource_type is not None:
            res_type = resource_type
        else:
            raise ValueError("Resource " + resource_name + " can be a corpus or a lexicon. Please specify the resource_type.")
    else:
        raise ValueError("Resource " + resource_name + " not found.")

    if res_type == "lexicon":
        # Create a sample query for this lexicon
        q = lexicon_query(word="", pos="", lexicon=resource_name)
        return _metadata_from_lexicon_query(q)
    elif res_type == "corpus":
        if resource_name in constants.AVAILABLE_CORPORA and resource_name != "nederlab":
            return _corpus_metadata_blacklab(resource_name)
        elif resource_name == "nederlab":
            print("Corpus metadata not yet available for Nederlab")
            return []
    else:
        raise ValueError("resource_type should be 'corpus' or 'lexicon'.")
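
# --- Usage sketch (added for illustration; not part of the original module) ---
# The resource_type argument only matters when a name is registered both as a
# corpus and as a lexicon in the constants tables; for unambiguous names it can
# be omitted, as in the docstring examples above. The helper below and the
# resource name "somename" are hypothetical, and calling the helper would hit
# the live services.
def _demo_resource_type():
    # Unambiguous name: the type is inferred automatically
    corpus_fields = get_available_metadata("zeebrieven")
    # Ambiguous (hypothetical) name: request one view explicitly; without
    # resource_type, get_available_metadata raises a ValueError for such names
    lexicon_fields = get_available_metadata("somename", resource_type="lexicon")
    return corpus_fields, lexicon_fields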
def _corpus_metadata_blacklab(corpus_name):
    '''
    Return all possible metadata fields for a BlackLab-based corpus,
    by sending a request to the corpus

    Args:
        corpus_name: Name of the corpus
    Returns:
        A dictionary with lists of document and token metadata
    '''
    corpus_url = constants.AVAILABLE_CORPORA[corpus_name]["blacklab_url"]
    response = requests.get(corpus_url)
    response_text = response.text
    return _parse_blacklab_metadata(response_text)


# TODO: Depending on which BlackLab indexing has been performed, this method
# could miss token fields which are extracted from the POS tag by FCS (e.g. inflection)
def _parse_blacklab_metadata(text):
    '''
    Parse metadata fields from a BlackLab metadata response

    Args:
        text: the XML response of a lexicon/corpus search, as a string
    Returns:
        A dictionary of lists of document and token metadata
    '''
    # TODO: should we secure against untrusted XML?
    root = ET.fromstring(text)
    doc_fields = [md.get("name") for md in root.iter("metadataField")]
    # TODO: Maybe irrelevant, but pay attention: after the BlackLab update,
    # token fields are under annotatedFields instead of complexFields
    token_fields = [prop.get("name") for prop in root.iter("property")]
    return {"document": doc_fields, "token": token_fields}


def _metadata_from_lexicon_query(lex_query):
    '''
    Extract metadata fields from a lexicon query string

    Args:
        lex_query: A query string issued to a lexicon, can be constructed
            using lexicon_query()
    Returns:
        A list of metadata fields
    '''
    # Get the part after SELECT, e.g.: "?x ?y ?concat('',z) as ?a"
    select_match = re.search(r'select\s+(?:distinct)?\s*(.*)\s*(?:where|from)', lex_query, flags=re.IGNORECASE)
    if select_match:
        select_string = select_match.group(1)
        # Delete the concat() part and the following AS, because it can contain
        # a space we do not want to split on
        string_wh_concat = re.sub(r'concat\(.*\) AS', '', select_string, flags=re.IGNORECASE)
        split_string = string_wh_concat.split()
        for i, elem in enumerate(split_string):
            if elem.lower() == "as":
                # Remove AS and the element before AS
                split_string.pop(i)
                split_string.pop(i-1)
                # Assume only one AS, so we escape the loop
                break
        columns = [c.lstrip("?") for c in split_string]
    else:
        raise ValueError("No columns found in lexicon query.")
    return columns
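
# --- Parsing sketches (added for illustration; not part of the original module) ---
# Both helpers below are hypothetical demos. The XML fragment is hand-written to
# mimic the shape of a BlackLab metadata response (metadataField elements for
# document fields, property elements for token fields); it is an assumed shape
# for illustration, not a captured server response.
def _demo_parse_blacklab_metadata():
    sample_xml = (
        '<blacklabResponse>'
        '<metadataField name="author"/>'
        '<metadataField name="witnessYear_from"/>'
        '<complexField><property name="word"/><property name="lemma"/></complexField>'
        '</blacklabResponse>'
    )
    # Yields {'document': ['author', 'witnessYear_from'], 'token': ['word', 'lemma']}
    return _parse_blacklab_metadata(sample_xml)


# A made-up SPARQL-style query string, shaped after the "select ... where"
# pattern that _metadata_from_lexicon_query expects.
def _demo_metadata_from_lexicon_query():
    sample_query = "SELECT ?lemma ?wordform ?lemPos WHERE { ?x ?y ?z }"
    # Yields ['lemma', 'wordform', 'lemPos']
    return _metadata_from_lexicon_query(sample_query)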