import os
import shutil
from capreolus import constants
from capreolus.utils.common import download_file, hash_file
from capreolus.utils.loginit import get_logger
from . import Collection
# Module-level logger, named after this module.
logger = get_logger(__name__)
# Package root directory, looked up in the capreolus constants mapping.
PACKAGE_PATH = constants["PACKAGE_PATH"]
@Collection.register
class ANTIQUE(Collection):
    """A Non-factoid Question Answering Benchmark from Hashemi et al. [1]

    [1] Helia Hashemi, Mohammad Aliannejadi, Hamed Zamani, and W. Bruce Croft. 2020.
    ANTIQUE: A non-factoid question answering benchmark. ECIR 2020.
    """

    module_name = "antique"
    _path = PACKAGE_PATH / "data" / "antique-collection"
    collection_type = "TrecCollection"
    generator_type = "DefaultLuceneDocumentGenerator"

    def download_if_missing(self):
        """Download the ANTIQUE collection and convert it to TREC format if needed.

        The converted collection is stored as ``documents/antique-collection.txt``
        under the directory returned by ``self.get_cache_path()``. If that file
        already exists, nothing is downloaded.

        Returns:
            str: path of the directory that holds the converted collection file.
        """
        url = "http://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt"
        cachedir = self.get_cache_path()
        document_dir = os.path.join(cachedir, "documents")
        coll_filename = os.path.join(document_dir, "antique-collection.txt")
        if os.path.exists(coll_filename):
            return document_dir

        # os.path.join works whether the cache path is a str or a Path.
        tmp_dir = os.path.join(cachedir, "tmp")
        tmp_filename = os.path.join(tmp_dir, "tmp.anqique.file")
        os.makedirs(tmp_dir, exist_ok=True)
        os.makedirs(document_dir, exist_ok=True)

        # NOTE(review): expected_hash is presumably checked by download_file against
        # the downloaded raw file -- confirm in capreolus.utils.common.
        download_file(url, tmp_filename, expected_hash="68b6688f5f2668c93f0e8e43384f66def768c4da46da4e9f7e2629c1c47a0c36")
        self._convert_to_trec(inp_path=tmp_filename, outp_path=coll_filename)
        logger.info(f"antique collection file prepared, stored at {coll_filename}")

        # rmtree removes the directory together with anything left inside it
        # (including legacy files), so no per-file cleanup is required.
        shutil.rmtree(tmp_dir)
        return document_dir

    def _convert_to_trec(self, inp_path, outp_path):
        """Convert a tab-separated ``docid<TAB>text`` file into TREC format.

        The output is first written to ``<outp_path>.partial`` and only moved to
        ``outp_path`` once every line has been converted, so an interrupted or
        failed conversion never leaves a truncated file at ``outp_path``.

        Args:
            inp_path: path of the input file; every line must consist of exactly
                two tab-separated fields (document id and document text).
            outp_path: path where the TREC-formatted file is written.

        Raises:
            FileNotFoundError: if ``inp_path`` does not exist.
            ValueError: if a line does not split into exactly two fields.
        """
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not os.path.exists(inp_path):
            raise FileNotFoundError(f"input file does not exist: {inp_path}")

        partial_path = f"{outp_path}.partial"
        try:
            with open(inp_path, "rt", encoding="utf-8") as fin, open(partial_path, "wt", encoding="utf-8") as fout:
                for line in fin:
                    docid, doc = line.strip().split("\t")
                    fout.write(f"<DOC>\n<DOCNO>{docid}</DOCNO>\n<TEXT>\n{doc}\n</TEXT>\n</DOC>\n")
            # Same-directory rename: the final file appears only when complete.
            os.replace(partial_path, outp_path)
        finally:
            # After a successful replace the partial file is gone; on failure
            # remove the incomplete output so it cannot be mistaken for data.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        logger.debug(f"Converted file {os.path.basename(inp_path)} to TREC format, output to: {outp_path}")

    def _validate_document_path(self, path):
        """Check that the sha256sum of the collection file under ``path`` is correct.

        Args:
            path: directory expected to contain ``antique-collection.txt``.

        Returns:
            bool: True if the hash computed by ``hash_file`` equals the expected value.
        """
        return (
            hash_file(os.path.join(path, "antique-collection.txt"))
            == "409e0960f918970977ceab9e5b1d372f45395af25d53b95644bdc9ccbbf973da"
        )