match.py

import xml.etree.ElementTree as ET
from collections import defaultdict

def topic_extractor(isEven=False):
    # Topics in dictionary
    topics = ET.parse('topics-rnd5.xml')
    root = topics.getroot()
    even_topics = [{'topic_id': int(topic.attrib['number']),
                    'query': topic[0].text,
                    'question': topic[1].text,
                    'narrative': topic[2].text} for topic in root if int(topic.attrib['number']) % 2 == 0]

    odd_topics = [{'topic_id': int(topic.attrib['number']),
                'query': topic[0].text,
                'question': topic[1].text,
                'narrative': topic[2].text} for topic in root  if int(topic.attrib['number']) % 2 == 1]

    if isEven:
        return even_topics
    else:
        return odd_topics

def topic_relevancy_extractor():
    # Only Docs with Relevancy Judgements
    topic_with_relevancy = defaultdict(list)

    with open("relevancy-judgements.txt", "r") as f:
        text = f.read()
        text = text.split('\n')
        for line in text:
            items = line.split()

            topic_id = int(items[0])
            cord_uid = items[2]
            relevancy = int(items[3])

            topic_with_relevancy[cord_uid].append({
                'topic_id' : topic_id,
                'relevancy': relevancy
            })
    return topic_with_relevancy