# Source code for capreolus.collection.robust04

import os
import shutil
import tarfile

from . import Collection
from capreolus import ModuleBase, Dependency, ConfigOption, constants
from capreolus.utils.common import download_file, hash_file, remove_newline
from capreolus.utils.loginit import get_logger
from capreolus.utils.trec import anserini_index_to_trec_docs, document_to_trectxt

# Module-level logger, named after this module.
logger = get_logger(__name__)
# Package path looked up from capreolus' shared constants mapping.
PACKAGE_PATH = constants["PACKAGE_PATH"]
@Collection.register
class Robust04(Collection):
    """TREC Robust04 (TREC disks 4 and 5 without the Congressional Record documents)."""

    module_name = "robust04"
    collection_type = "TrecCollection"
    generator_type = "DefaultLuceneDocumentGenerator"
    config_keys_not_in_path = ["path"]
    config_spec = [ConfigOption("path", "Aquaint-TREC-3-4", "path to corpus")]

    def download_if_missing(self):
        """Download the prebuilt Robust04 index if needed and return the documents directory.

        Delegates to `download_index` with the Robust04-specific archive
        parameters, using `self.get_cache_path()` as the cache directory.

        Returns:
            Whatever `download_index` returns: the path of the directory holding the extracted documents.
        """
        # NOTE(review): `url` is an empty string in this source, so the download
        # presumably cannot succeed as written -- confirm the intended index URL.
        return self.download_index(
            url="",
            sha256="dddb81f16d70ea6b9b0f94d6d6b888ed2ef827109a14ca21fd82b2acd6cbd450",
            index_directory_inside="index-robust04-20191213/",
            # this string should match how the index was built (i.e., Anserini, stopwords removed, Porter stemming)
            index_cache_path_string="index-anserini_indexstops-False_stemmer-porter",
            index_expected_document_count=528_030,
            cachedir=self.get_cache_path(),
        )

    def _validate_document_path(self, path):
        """Validate that the document path appears to contain robust04's documents (Aquaint-TREC-3-4).

        Validation is performed by looking for four directories (case-insensitive):
        `FBIS`, `FR94`, `FT`, and `LATIMES`. These directories may either be at the
        root of `path` or they may be in `path/NEWS_data` (case-insensitive).

        Args:
            path: candidate corpus directory.

        Returns:
            True if the Aquaint-TREC-3-4 document directories are found or False if not
        """
        if not os.path.isdir(path):
            return False

        # Map lower-cased entry names to their on-disk spelling for case-insensitive lookup.
        contents = {fn.lower(): fn for fn in os.listdir(path)}
        if "news_data" in contents:
            # The corpus directories live one level down; inspect that directory instead.
            contents = {fn.lower(): fn for fn in os.listdir(os.path.join(path, contents["news_data"]))}

        return "fbis" in contents and "fr94" in contents and "ft" in contents and "latimes" in contents

    def download_index(
        self, cachedir, url, sha256, index_directory_inside, index_cache_path_string, index_expected_document_count
    ):
        """Download an Anserini index archive and extract its raw documents into the cache.

        To avoid re-downloading on every call, an empty-ish `done` file is
        created in `cachedir` on success; when it already exists the method
        returns immediately.

        Args:
            cachedir: cache directory that receives the documents, the index and the `done` marker.
            url: location of the index archive, passed to `download_file`.
            sha256: expected hash of the archive, passed to `download_file` as `expected_hash`.
            index_directory_inside: path of the index directory inside the extracted archive.
            index_cache_path_string: sub-directory of `cachedir` under which the index is stored (as `<sub-directory>/index`).
            index_expected_document_count: document count passed to `anserini_index_to_trec_docs`.

        Returns:
            Path of the `documents` directory inside `cachedir`.

        Raises:
            ValueError: if the extracted archive does not contain `index_directory_inside`.
        """
        done_file = os.path.join(cachedir, "done")
        document_dir = os.path.join(cachedir, "documents")

        # already downloaded?
        if os.path.exists(done_file):
            return document_dir

        # 1. Download and extract Anserini index to a temporary location
        tmp_dir = os.path.join(cachedir, "tmp_download")
        archive_file = os.path.join(tmp_dir, "archive_file")
        os.makedirs(document_dir, exist_ok=True)
        os.makedirs(tmp_dir, exist_ok=True)
        logger.info("downloading index for missing collection %s to temporary file %s", self.module_name, archive_file)
        download_file(url, archive_file, expected_hash=sha256)

        logger.info("extracting index to %s (before moving to correct cache path)", tmp_dir)
        with tarfile.open(archive_file) as tar:
            tar.extractall(path=tmp_dir)

        extracted_dir = os.path.join(tmp_dir, index_directory_inside)
        if not os.path.isdir(extracted_dir):
            raise ValueError(f"could not find expected index directory {extracted_dir} in {tmp_dir}")

        # 2. Move index to its correct location in the cache
        index_dir = os.path.join(cachedir, index_cache_path_string, "index")
        # Check the marker inside the real index directory. The previous code joined
        # the string literal "index_dir", which pointed at a path relative to the
        # current working directory instead of the cached index.
        # NOTE(review): whatever writes <index_dir>/done is not visible in this file.
        if not os.path.exists(os.path.join(index_dir, "done")):
            if os.path.exists(index_dir):
                shutil.rmtree(index_dir)
            shutil.move(extracted_dir, index_dir)

        # 3. Extract raw documents from the Anserini index to document_dir
        anserini_index_to_trec_docs(index_dir, document_dir, index_expected_document_count)

        # remove temporary files and create a /done we can use to verify extraction was successful
        shutil.rmtree(tmp_dir)
        with open(done_file, "wt") as outf:
            print("", file=outf)

        return document_dir