Source code for chaininglib.search.metadata

import xml.etree.ElementTree as ET
import chaininglib.constants as constants
import re
import requests

from chaininglib.search.lexiconQueries import lexicon_query


def get_available_metadata(resource_name, resource_type=None):
    '''
    Return all possible metadata fields for a lexicon or corpus

    Args:
        resource_name: Name of the lexicon or corpus
        resource_type: (optional) One of 'lexicon' or 'corpus'. Can be used to
            disambiguate when the resource name can be both a lexicon and a corpus
    Returns:
        A dictionary of lists of document and token metadata (corpus),
        or a list of metadata fields (lexicon)

    >>> corpus_metadata = get_available_metadata("zeebrieven")
    >>> print(corpus_metadata)
    {'document': ['aantal_paginas', 'aantal_woorden', ..., 'witnessYear_from', 'witnessYear_to'], 'token': ['word', 'lemma', 'pos', 'punct', 'starttag']}
    >>> lexicon_metadata = get_available_metadata("molex")
    >>> print(lexicon_metadata)
    ['lemEntryId', 'lemma', 'lemPos', 'wordformId', 'wordform', 'hyphenation', 'wordformPos', 'Gender', 'Number']
    '''
    # Infer the resource type from the name
    if resource_name in constants.AVAILABLE_CORPORA and resource_name not in constants.AVAILABLE_LEXICA:
        res_type = "corpus"
    elif resource_name in constants.AVAILABLE_LEXICA and resource_name not in constants.AVAILABLE_CORPORA:
        res_type = "lexicon"
    elif resource_name in constants.AVAILABLE_LEXICA and resource_name in constants.AVAILABLE_CORPORA:
        if resource_type is not None:
            res_type = resource_type
        else:
            raise ValueError("Resource " + resource_name + " can be a corpus or a lexicon. Please specify the resource_type.")
    else:
        raise ValueError("Resource " + resource_name + " not found.")

    if res_type == "lexicon":
        # Create a sample query for this lexicon
        q = lexicon_query(word="", pos="", lexicon=resource_name)
        return _metadata_from_lexicon_query(q)
    elif res_type == "corpus":
        if resource_name in constants.AVAILABLE_CORPORA and resource_name != "nederlab":
            return _corpus_metadata_blacklab(resource_name)
        elif resource_name == "nederlab":
            print("Corpus metadata not yet available for Nederlab")
            return []
    else:
        raise ValueError("resource_type should be 'corpus' or 'lexicon'.")
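
# --- Usage sketch (added for illustration; not part of the original module) ---
# The resource_type argument only matters when a name is registered both as a
# corpus and as a lexicon in the constants tables; for unambiguous names it can
# be omitted, as in the docstring examples above. The helper below and the
# resource name "somename" are hypothetical, and calling the helper would hit
# the live services.
def _demo_resource_type():
    # Unambiguous name: the type is inferred automatically
    corpus_fields = get_available_metadata("zeebrieven")
    # Ambiguous (hypothetical) name: request one view explicitly; without
    # resource_type, get_available_metadata raises a ValueError for such names
    lexicon_fields = get_available_metadata("somename", resource_type="lexicon")
    return corpus_fields, lexicon_fields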
def _corpus_metadata_blacklab(corpus_name):
    '''
    Return all possible metadata fields for a BlackLab-based corpus,
    by sending a request to the corpus

    Args:
        corpus_name: Name of the corpus
    Returns:
        A dictionary with lists of document and token metadata
    '''
    corpus_url = constants.AVAILABLE_CORPORA[corpus_name]["blacklab_url"]
    response = requests.get(corpus_url)
    response_text = response.text
    return _parse_blacklab_metadata(response_text)


# TODO: Depending on which BlackLab indexing has been performed, this method
# could miss token fields which are extracted from the POS tag by FCS (e.g. inflection)
def _parse_blacklab_metadata(text):
    '''
    Parse metadata fields from a BlackLab metadata response

    Args:
        text: the XML response of a lexicon/corpus search, as a string
    Returns:
        A dictionary of lists of document and token metadata
    '''
    # TODO: should we secure against untrusted XML?
    root = ET.fromstring(text)
    doc_fields = [md.get("name") for md in root.iter("metadataField")]
    # TODO: Maybe irrelevant, but pay attention: after the BlackLab update,
    # token fields are under annotatedFields instead of complexFields
    token_fields = [prop.get("name") for prop in root.iter("property")]
    return {"document": doc_fields, "token": token_fields}


def _metadata_from_lexicon_query(lex_query):
    '''
    Extract metadata fields from a lexicon query string

    Args:
        lex_query: A query string issued to a lexicon, can be constructed
            using lexicon_query()
    Returns:
        A list of metadata fields
    '''
    # Get the part after SELECT, e.g.: "?x ?y ?concat('',z) as ?a"
    select_match = re.search(r'select\s+(?:distinct)?\s*(.*)\s*(?:where|from)', lex_query, flags=re.IGNORECASE)
    if select_match:
        select_string = select_match.group(1)
        # Delete the concat() part and the following AS, because it can contain
        # a space we do not want to split on
        string_wh_concat = re.sub(r'concat\(.*\) AS', '', select_string, flags=re.IGNORECASE)
        split_string = string_wh_concat.split()
        for i, elem in enumerate(split_string):
            if elem.lower() == "as":
                # Remove AS and the element before AS
                split_string.pop(i)
                split_string.pop(i-1)
                # Assume only one AS, so we escape the loop
                break
        columns = [c.lstrip("?") for c in split_string]
    else:
        raise ValueError("No columns found in lexicon query.")
    return columns
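
# --- Parsing sketches (added for illustration; not part of the original module) ---
# Both helpers below are hypothetical demos. The XML fragment is hand-written to
# mimic the shape of a BlackLab metadata response (metadataField elements for
# document fields, property elements for token fields); it is an assumed shape
# for illustration, not a captured server response.
def _demo_parse_blacklab_metadata():
    sample_xml = (
        '<blacklabResponse>'
        '<metadataField name="author"/>'
        '<metadataField name="witnessYear_from"/>'
        '<complexField><property name="word"/><property name="lemma"/></complexField>'
        '</blacklabResponse>'
    )
    # Yields {'document': ['author', 'witnessYear_from'], 'token': ['word', 'lemma']}
    return _parse_blacklab_metadata(sample_xml)


# A made-up SPARQL-style query string, shaped after the "select ... where"
# pattern that _metadata_from_lexicon_query expects.
def _demo_metadata_from_lexicon_query():
    sample_query = "SELECT ?lemma ?wordform ?lemPos WHERE { ?x ?y ?z }"
    # Yields ['lemma', 'wordform', 'lemPos']
    return _metadata_from_lexicon_query(sample_query)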