# Source code for capreolus.utils.irds

import json

import ir_datasets

from capreolus import ConfigOption
from capreolus.benchmark import Benchmark, IRDBenchmark
from capreolus.collection import Collection, IRDCollection


def dataset_to_collection(name):
    """Return the id of the highest-level ancestor dataset sharing ``name``'s documents.

    The id is shortened one ``/``-separated segment at a time. Every ancestor
    that has documents and whose ``docs_handler()`` compares equal to the one
    of ``name`` replaces the current answer, so the ancestor closest to the
    root wins. If no ancestor qualifies, ``name`` itself is returned.

    Args:
        name: an ir_datasets dataset id, e.g. ``"a/b/c"``.

    Returns:
        The dataset id (a string) to use as the collection for ``name``.

    Raises:
        KeyError: if ``name`` itself is not a dataset id known to ir_datasets
            (propagated from ``ir_datasets.load``).
    """
    # adapted from https://github.com/Georgetown-IR-Lab/OpenNIR/blob/master/onir/datasets/irds.py#L47
    # HACK: find "parent" dataset that contains same docs handler so we don't re-build the index for the same collection
    ds = ir_datasets.load(name)
    segments = name.split("/")
    docs_handler = ds.docs_handler()
    parent_docs_ds = name
    while len(segments) > 1:
        segments = segments[:-1]
        candidate = "/".join(segments)
        try:
            parent_ds = ir_datasets.load(candidate)
        except KeyError:
            # Not every prefix of a valid dataset id is itself a registered
            # dataset; skip the gap and keep walking towards the root.
            continue
        if parent_ds.has_docs() and parent_ds.docs_handler() == docs_handler:
            parent_docs_ds = candidate
    return parent_docs_ds


def get_irds(dataset, query_type, fields):
    """Create and register a collection/benchmark pair for ir_datasets id(s).

    Args:
        dataset: a single dataset id or a sequence of dataset ids. Every id
            must map to the same collection via ``dataset_to_collection``.
        query_type: value stored in the benchmark's ``query_type`` config
            option; the benchmark uses it as the key into ``self.topics``.
        fields: a single document attribute name or a list of them; the
            attribute values are joined with spaces to form the indexed text.

    Returns:
        A ``(collection, benchmark)`` tuple of freshly instantiated
        ``DynamicIRDCollection`` and ``DynamicIRDBenchmark`` objects.

    Raises:
        AssertionError: if the ids do not resolve to exactly one collection.
    """
    # Both arguments may be given as a bare string; normalise to lists.
    fields = [fields] if isinstance(fields, str) else fields
    dataset = [dataset] if isinstance(dataset, str) else dataset

    # All requested datasets have to live on top of one shared document set.
    resolved_names = {dataset_to_collection(ds_name) for ds_name in dataset}
    assert len(resolved_names) == 1
    collection_name = tuple(resolved_names)[0]

    @Collection.register
    class DynamicIRDCollection(IRDCollection):
        # Collection module named after the resolved parent dataset.
        module_name = collection_name
        ird_dataset_name = collection_name
        config_spec = [ConfigOption("fields", ["body"], "fields to index", value_type="strlist")]
        collection_type = "JsonCollection"

        def doc_as_json(self, doc):
            # Concatenate the configured attributes of ``doc`` into one string.
            pieces = [getattr(doc, attr) for attr in self.config["fields"]]
            contents = " ".join(pieces)
            record = {"id": doc.doc_id, "contents": contents}
            return json.dumps(record)

    @Benchmark.register
    class DynamicIRDBenchmark(IRDBenchmark):
        # Benchmark module named after the comma-joined list of dataset ids.
        module_name = ",".join(dataset)
        ird_dataset_names = dataset
        config_spec = [ConfigOption("query_type", "title")]

        @property
        def query_type(self):
            return self.config["query_type"]

        @property
        def queries(self):
            # Select the topics stored under the configured query type.
            selected = self.query_type
            return self.topics[selected]

    collection = DynamicIRDCollection({"fields": fields})
    benchmark = DynamicIRDBenchmark({"query_type": query_type})
    return collection, benchmark