Source code for capreolus.searcher.anserini

import math
import os
import subprocess

import numpy as np

from capreolus import ConfigOption, Dependency, constants
from capreolus.utils.common import Anserini
from capreolus.utils.loginit import get_logger

from . import Searcher

# Logger shared by every searcher defined in this module.
logger = get_logger(__name__)  # pylint: disable=invalid-name

# Value handed to Anserini's ``-threads`` option when a search is launched.
MAX_THREADS = constants["MAX_THREADS"]
def list2str(l, delimiter="-"):
    """Return the ``str()`` form of every element of ``l`` joined by ``delimiter``."""
    return delimiter.join(map(str, l))
class AnseriniSearcherMixIn:
    """MixIn for searchers that use Anserini's SearchCollection script"""

    dependencies = [Dependency(key="index", module="index", name="anserini")]

    def _anserini_query_from_file(self, topicsfn, anserini_param_str, output_base_path):
        """Run Anserini's SearchCollection over a topics file.

        The run output is written to ``<output_base_path>/searcher``. After a
        successful run a ``done`` marker file is created in ``output_base_path``;
        when that marker already exists the call returns without doing anything.

        Args:
            topicsfn: path to the topics file handed to ``-topics``.
            anserini_param_str: whitespace-separated extra command line options
                (ranking model and its parameters).
            output_base_path: directory receiving the run output and the marker.

        Raises:
            IOError: if ``topicsfn`` does not exist.
            RuntimeError: if the java process exits with a non-zero return code.
        """
        if not os.path.exists(topicsfn):
            raise IOError(f"could not find topics file: {topicsfn}")

        donefn = os.path.join(output_base_path, "done")
        if os.path.exists(donefn):
            logger.debug(f"skipping Anserini SearchCollection call because path already exists: {donefn}")
            return

        # create index if it does not exist. the call returns immediately if the index does exist.
        self.index.create_index()

        os.makedirs(output_base_path, exist_ok=True)
        output_path = os.path.join(output_base_path, "searcher")

        index_path = self.index.get_index_path()
        anserini_fat_jar = Anserini.get_fat_jar()
        stemmer = self.index.config["stemmer"]
        cmd = [
            "java",
            "-classpath",
            anserini_fat_jar,
            "-Xms512M",
            "-Xmx31G",
            # The JVM needs a main class here; empty strings in these two
            # positions make java fail before SearchCollection ever starts.
            "-Dapp.name=SearchCollection",
            "io.anserini.search.SearchCollection",
            "-topicreader",
            "TsvString",
            "-index",
            index_path,
            "-topics",
            topicsfn,
            "-output",
            output_path,
            "-inmem",
            "-threads",
            str(MAX_THREADS),
            "-stemmer",
            "none" if stemmer is None else stemmer,
        ] + anserini_param_str.split()

        if self.index.config["indexstops"]:
            cmd += ["-keepStopwords"]

        logger.info("Anserini writing runs to %s", output_path)
        logger.debug(cmd)

        # The context manager closes the stdout pipe and reaps the process even
        # if logging one of the output lines raises.
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True) as app:
            # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
            for line in app.stdout:
                Anserini.filter_and_log_anserini_output(line, logger)
            app.wait()

        if app.returncode != 0:
            raise RuntimeError(f"command failed with return code {app.returncode}")

        # Only written after a successful run, so a failed search is retried.
        with open(donefn, "wt") as donef:
            print("done", file=donef)
class PostprocessMixin:
    """Post-processing helpers that rewrite the TREC run files inside a run directory."""

    def _keep_topn(self, runs, topn):
        """Trim every query in ``runs`` to its ``topn`` highest-scoring documents.

        Args:
            runs: mapping of query id -> {doc id: score}; modified in place.
            topn: maximum number of documents kept per query.

        Returns:
            The same ``runs`` mapping.
        """
        # Query ids are treated as opaque keys: truncation is per query, so no
        # ordering (and no int() conversion of the ids) is required.
        for qid, docs in list(runs.items()):
            if len(docs) <= topn:
                continue
            best = sorted(docs.items(), key=lambda kv: kv[1], reverse=True)[:topn]
            runs[qid] = dict(best)
        return runs

    def filter(self, run_dir, docs_to_remove=None, docs_to_keep=None, topn=None):
        """Filter every run file in ``run_dir`` in place.

        Args:
            run_dir: directory whose files (except ``done``) are run files.
            docs_to_remove: list of doc ids, or mapping query id -> doc ids, to drop.
                Takes priority over ``docs_to_keep`` when both are given.
            docs_to_keep: list of doc ids, or mapping query id -> doc ids, to retain.
            topn: if truthy, keep only this many top-scoring documents per query.

        Returns:
            ``run_dir``.

        Raises:
            ValueError: if neither ``docs_to_remove`` nor ``docs_to_keep`` is given.
        """
        if (not docs_to_keep) and (not docs_to_remove):
            raise ValueError("filter() requires a non-empty docs_to_remove or docs_to_keep")

        for fn in os.listdir(run_dir):
            if fn == "done":
                continue
            self._filter(os.path.join(run_dir, fn), docs_to_remove, docs_to_keep, topn)
        return run_dir

    def _filter(self, runfile, docs_to_remove, docs_to_keep, topn):
        """Apply the remove/keep filter (and optional top-n cut) to one run file, overwriting it."""
        runs = Searcher.load_trec_run(runfile)

        # Doc id collections are turned into sets once so that the membership
        # test per document is constant time.
        if docs_to_remove:  # prioritize docs_to_remove
            if isinstance(docs_to_remove, list):
                shared = frozenset(docs_to_remove)
                per_query = {q: shared for q in runs}
            else:
                per_query = {q: frozenset(docs_to_remove.get(q, ())) for q in runs}
            runs = {q: {d: v for d, v in docs.items() if d not in per_query[q]} for q, docs in runs.items()}
        elif docs_to_keep:
            if isinstance(docs_to_keep, list):
                shared = frozenset(docs_to_keep)
                per_query = {q: shared for q in runs}
            else:
                # A query is only looked up when it has documents, so a query
                # with an empty result list need not appear in docs_to_keep.
                per_query = {q: frozenset(docs_to_keep[q]) for q, docs in runs.items() if docs}
            runs = {q: {d: v for d, v in docs.items() if d in per_query[q]} for q, docs in runs.items()}

        if topn:
            runs = self._keep_topn(runs, topn)
        Searcher.write_trec_run(runs, runfile)  # overwrite runfile

    def dedup(self, run_dir, topn=None):
        """Collapse passage-level results to document level for every run file in ``run_dir``.

        Args:
            run_dir: directory whose files (except ``done``) are run files.
            topn: if truthy, keep only this many top-scoring documents per query.

        Returns:
            ``run_dir``.
        """
        for fn in os.listdir(run_dir):
            if fn == "done":
                continue
            self._dedup(os.path.join(run_dir, fn), topn)
        return run_dir

    def _dedup(self, runfile, topn):
        """Rewrite one run file so each document keeps the best score among its passages."""
        runs = Searcher.load_trec_run(runfile)
        new_runs = {q: {} for q in runs}

        # use the maximum passage score as the document score, no sorting is done here
        for q, psg in runs.items():
            for pid, score in psg.items():
                # the document id is the part of the passage id before the first "."
                docid = pid.split(".")[0]
                new_runs[q][docid] = max(new_runs[q].get(docid, -math.inf), score)
        runs = new_runs

        if topn:
            runs = self._keep_topn(runs, topn)
        Searcher.write_trec_run(runs, runfile)
class BM25(AnseriniSearcherMixIn, Searcher):
    """Anserini BM25. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``)."""

    module_name = "BM25"

    config_spec = [
        ConfigOption("k1", 0.9, "controls term saturation", value_type="floatlist"),
        ConfigOption("b", 0.4, "controls document length normalization", value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run a BM25 search for the queries in a topics file.

        Args:
            topicsfn: Path to a topics file
            output_path: Path under which the search results are stored
            config: mapping providing the ``b``, ``k1`` and ``hits`` values

        Returns:
            ``output_path``
        """
        b_values = list2str(config["b"], delimiter=" ")
        k1_values = list2str(config["k1"], delimiter=" ")
        anserini_param_str = "-bm25 -bm25.b {} -bm25.k1 {} -hits {}".format(b_values, k1_values, config["hits"])

        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)
        return output_path
class BM25Grid(AnseriniSearcherMixIn, Searcher):
    """Deprecated. BM25 with a grid search for k1 and b. Search is from 0.1 to bmax/k1max in 0.1 increments"""

    module_name = "BM25Grid"

    config_spec = [
        ConfigOption("k1max", 1.0, "maximum k1 value to include in grid search (starting at 0.1)"),
        ConfigOption("bmax", 1.0, "maximum b value to include in grid search (starting at 0.1)"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    @staticmethod
    def _grid_values(maximum):
        """Return 0.1, 0.2, ... up to (and never beyond) ``maximum``, rounded to one decimal.

        The values are derived from integer steps. ``np.arange`` with a float
        step can emit one element too many (``np.arange(0.1, 0.4, 0.1)`` ends
        with 0.4), which would search past the configured maximum.
        """
        # the small epsilon absorbs representation error in maximum * 10
        n_steps = int(math.floor(maximum * 10 + 1e-9))
        return np.around(np.arange(1, n_steps + 1) / 10, 1)

    def _query_from_file(self, topicsfn, output_path, config):
        """Run BM25 over the full (b, k1) grid for the queries in a topics file.

        Args:
            topicsfn: path to a topics file.
            output_path: path under which the search results are stored.
            config: mapping providing ``bmax``, ``k1max`` and ``hits``.

        Returns:
            ``output_path``
        """
        bs = self._grid_values(config["bmax"])
        k1s = self._grid_values(config["k1max"])
        bstr = " ".join(str(x) for x in bs)
        k1str = " ".join(str(x) for x in k1s)
        hits = config["hits"]
        anserini_param_str = f"-bm25 -bm25.b {bstr} -bm25.k1 {k1str} -hits {hits}"

        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)
        return output_path
class BM25RM3(AnseriniSearcherMixIn, Searcher):
    """Anserini BM25 with RM3 expansion. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``)."""

    module_name = "BM25RM3"

    config_spec = [
        ConfigOption("k1", "0.9", "controls term saturation", value_type="floatlist"),
        ConfigOption("b", "0.4", "controls document length normalization", value_type="floatlist"),
        ConfigOption("fbTerms", [5, 25], "number of generated terms from feedback", value_type="intlist"),
        ConfigOption("fbDocs", [5, 10], "number of documents used for feedback", value_type="intlist"),
        ConfigOption("originalQueryWeight", [0.5], "the weight of unexpended query", value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run BM25 with RM3 expansion for the queries in a topics file and return ``output_path``."""

        def options(prefix, keys):
            # one "-<prefix>.<key> <space separated values>" group per key
            return " ".join(f"-{prefix}.{key} {list2str(config[key], ' ')}" for key in keys)

        hits = str(config["hits"])
        rm3_options = options("rm3", ["fbTerms", "fbDocs", "originalQueryWeight"])
        bm25_options = options("bm25", ["k1", "b"])
        anserini_param_str = "-rm3 " + rm3_options + " -bm25 " + bm25_options + f" -hits {hits}"

        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)
        return output_path
class BM25PostProcess(BM25, PostprocessMixin):
    """BM25 searcher whose run files can be filtered and/or de-duplicated after the search."""

    module_name = "BM25Postprocess"

    config_spec = [
        ConfigOption("k1", 0.9, "controls term saturation", value_type="floatlist"),
        ConfigOption("b", 0.4, "controls document length normalization", value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results expected from the core searcher"),
        ConfigOption("topn", 1000, "number of results expected after the filtering (if any)"),
        ConfigOption("dedup", False),
    ]

    def query_from_file(self, topicsfn, output_path, docs_to_remove=None):
        """Search, then optionally filter and de-duplicate the resulting run files.

        Args:
            topicsfn: path to a topics file.
            output_path: path passed to the parent ``query_from_file``.
            docs_to_remove: doc ids (list, or mapping query id -> doc ids) to drop
                from the results; no filtering happens when this is empty.

        Returns:
            The path holding the (post-processed) run files.
        """
        output_path = super().query_from_file(topicsfn, output_path)  # will call _query_from_file() from BM25

        if docs_to_remove:
            output_path = self.filter(output_path, docs_to_remove=docs_to_remove, topn=self.config["topn"])
        if self.config["dedup"]:
            output_path = self.dedup(output_path, topn=self.config["topn"])

        return output_path
class StaticRun(Searcher):
    """Base class for searchers that serve a run file stored under the package's ``data`` directory.

    Subclasses provide the file name through their ``run_fn`` attribute.
    """

    def _query_from_file(self, topicsfn, output_path, config):
        """Copy the static run file into ``output_path`` unless it is already there.

        Args:
            topicsfn: unused; the run is static.
            output_path: directory that receives the run file.
            config: unused.

        Returns:
            ``output_path``

        Raises:
            ValueError: if the class does not define a non-empty ``run_fn``.
        """
        import shutil

        if not self.run_fn:
            raise ValueError(f"{type(self).__name__} does not define the run file to copy (run_fn is empty)")

        # Check for the copied file itself: testing the directory would skip
        # the copy whenever output_path merely exists.
        outfn = os.path.join(output_path, os.path.basename(self.run_fn))
        if not os.path.exists(outfn):
            os.makedirs(output_path, exist_ok=True)
            shutil.copy2(constants["PACKAGE_PATH"] / "data" / self.run_fn, outfn)

        return output_path

    def query(self, *args, **kwargs):
        """Always raises ``NotImplementedError``: a static run cannot answer new queries."""
        raise NotImplementedError("this searcher uses a static run file, so it cannot handle new queries")
class StaticBM25RM3Rob04Yang19(StaticRun):
    """Tuned BM25+RM3 run used by Yang et al. in [1]. This should be used only with a benchmark using the same folds and queries.

    [1] Wei Yang, Kuang Lu, Peilin Yang, and Jimmy Lin. Critically Examining the "Neural Hype": Weak Baselines and the Additivity of Effectiveness Gains from Neural Ranking Models. SIGIR 2019.
    """

    module_name = "bm25staticrob04yang19"
    # NOTE(review): the run file name is empty in this listing — confirm the intended value.
    run_fn = ""
class StaticBM25RM3Rob04Yang19Desc(StaticRun):
    """Tuned BM25+RM3 robust04 description run on the folds used by Yang et al. in [1]. This should be used only with a benchmark using the same folds and queries.

    [1] Wei Yang, Kuang Lu, Peilin Yang, and Jimmy Lin. Critically Examining the "Neural Hype": Weak Baselines and the Additivity of Effectiveness Gains from Neural Ranking Models. SIGIR 2019.
    """

    module_name = "bm25staticrob04yang19desc"
    # NOTE(review): the run file name is empty in this listing — confirm the intended value.
    run_fn = ""
class StaticBM25Rob04Huston14Title(StaticRun):
    """Static run searcher registered under the module name ``bm25staticrob04huston14title``."""

    module_name = "bm25staticrob04huston14title"
    # NOTE(review): the run file name is empty in this listing — confirm the intended value.
    run_fn = ""
class StaticBM25Rob04Huston14Desc(StaticRun):
    """Static run searcher registered under the module name ``bm25staticrob04huston14desc``."""

    module_name = "bm25staticrob04huston14desc"
    # NOTE(review): the run file name is empty in this listing — confirm the intended value.
    run_fn = ""
class StaticBM25Gov2(StaticRun):
    """Static run searcher registered under the module name ``bm25staticgov2``."""

    module_name = "bm25staticgov2"
    # NOTE(review): the run file name is empty in this listing — confirm the intended value.
    run_fn = ""
class StaticBM25Gov2Desc(StaticRun):
    """Static run searcher registered under the module name ``bm25staticgov2desc``."""

    module_name = "bm25staticgov2desc"
    # NOTE(review): the run file name is empty in this listing — confirm the intended value.
    run_fn = ""
class StaticBM25Genomics(StaticRun):
    """Static run searcher registered under the module name ``bm25staticgenomics``."""

    module_name = "bm25staticgenomics"
    # NOTE(review): the run file name is empty in this listing — confirm the intended value.
    run_fn = ""
class StaticBM25CDS(StaticRun):
    """CDS BM25 run with k1=4.0, b=0.6 and new CDS 2016 documents removed from the 2014 and 2015 queries"""

    module_name = "bm25staticcds"
    # NOTE(review): the run file name is empty in this listing — confirm the intended value.
    run_fn = ""
class StaticCovidUdelAbstract(StaticRun):
    """Static run searcher registered under the module name ``qdelstaticcovidabstract``."""

    module_name = "qdelstaticcovidabstract"
    # File name of the static run served by this searcher.
    run_fn = "anserini.covid-r5.abstract.qdel.bm25-top1k.txt"
class StaticRM3TitleCore18(StaticRun):
    """Static run searcher registered under the module name ``rm3staticcore18title``."""

    module_name = "rm3staticcore18title"
    # NOTE(review): the run file name is empty in this listing — confirm the intended value.
    run_fn = ""
class StaticRM3DescCore18(StaticRun):
    """Static run searcher registered under the module name ``rm3staticcore18desc``."""

    module_name = "rm3staticcore18desc"
    # NOTE(review): the run file name is empty in this listing — confirm the intended value.
    run_fn = ""
class BM25PRF(AnseriniSearcherMixIn, Searcher):
    """Anserini BM25 PRF. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``)."""

    module_name = "BM25PRF"

    config_spec = [
        ConfigOption("k1", [0.65, 0.70, 0.75], "controls term saturation", value_type="floatlist"),
        ConfigOption("b", [0.60, 0.7], "controls document length normalization", value_type="floatlist"),
        ConfigOption("fbTerms", [65, 70, 95, 100], "number of generated terms from feedback", value_type="intlist"),
        ConfigOption("fbDocs", [5, 10, 15], "number of documents used for feedback", value_type="intlist"),
        ConfigOption("newTermWeight", [0.2, 0.25], value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run BM25 with pseudo relevance feedback for the queries in a topics file.

        Args:
            topicsfn: path to a topics file.
            output_path: path under which the search results are stored.
            config: mapping providing the BM25PRF parameters and ``hits``.

        Returns:
            ``output_path``
        """
        hits = str(config["hits"])

        # the same k1 and b values are passed to both the PRF and the BM25 options
        anserini_param_str = (
            "-bm25prf "
            + " ".join(f"-bm25prf.{k} {list2str(config[k], ' ')}" for k in ["fbTerms", "fbDocs", "newTermWeight", "k1", "b"])
            + " -bm25 "
            + " ".join(f"-bm25.{k} {list2str(config[k], ' ')}" for k in ["k1", "b"])
            + f" -hits {hits}"
        )

        # report through the module logger rather than printing to stdout
        logger.debug("BM25PRF output path: %s", output_path)
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path
class AxiomaticSemanticMatching(AnseriniSearcherMixIn, Searcher):
    """Anserini BM25 with Axiomatic query expansion. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``)."""

    module_name = "axiomatic"

    config_spec = [
        ConfigOption("k1", 0.9, "controls term saturation", value_type="floatlist"),
        ConfigOption("b", 0.4, "controls document length normalization", value_type="floatlist"),
        ConfigOption("r", 20, value_type="intlist"),
        ConfigOption("n", 30, value_type="intlist"),
        ConfigOption("beta", 0.4, value_type="floatlist"),
        ConfigOption("top", 20, value_type="intlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run BM25 with axiomatic expansion for the queries in a topics file.

        Args:
            topicsfn: path to a topics file.
            output_path: path under which the search results are stored.
            config: mapping providing ``r``, ``n``, ``beta``, ``top``, ``k1``, ``b`` and ``hits``.

        Returns:
            ``output_path``
        """
        hits = str(config["hits"])

        # every value needs its own option name; a bare value for "top" would
        # reach SearchCollection as a stray positional argument
        anserini_param_str = "-axiom -axiom.deterministic -axiom.r {0} -axiom.n {1} -axiom.beta {2} -axiom.top {3}".format(
            *[list2str(config[k], " ") for k in ["r", "n", "beta", "top"]]
        )
        anserini_param_str += " -bm25 -bm25.k1 {0} -bm25.b {1} ".format(*[list2str(config[k], " ") for k in ["k1", "b"]])
        anserini_param_str += f" -hits {hits}"

        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)
        return output_path
class DirichletQL(AnseriniSearcherMixIn, Searcher):
    """Anserini QL with Dirichlet smoothing. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``)."""

    module_name = "DirichletQL"

    config_spec = [
        ConfigOption("mu", 1000, "smoothing parameter", value_type="intlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run a Dirichlet QL search for the queries in a topics file.

        Args:
            topicsfn: Path to a topics file
            output_path: Path under which the search results are stored
            config: mapping providing the ``mu`` and ``hits`` values

        Returns:
            ``output_path``
        """
        mustr = list2str(config["mu"], delimiter=" ")
        hits = config["hits"]
        # "-qld" selects the model; the smoothing values belong to "-qld.mu",
        # mirroring the "-<model> -<model>.<param>" form of the other searchers
        anserini_param_str = f"-qld -qld.mu {mustr} -hits {hits}"

        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)
        return output_path
class QLJM(AnseriniSearcherMixIn, Searcher):
    """Anserini QL with Jelinek-Mercer smoothing. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``)."""

    module_name = "QLJM"

    config_spec = [
        ConfigOption("lam", 0.1, value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run a Jelinek-Mercer QL search for the queries in a topics file and return ``output_path``."""
        lambdas = list2str(config["lam"], delimiter=" ")
        hits = config["hits"]
        anserini_param_str = f"-qljm -qljm.lambda {lambdas} -hits {hits}"

        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)
        return output_path
class INL2(AnseriniSearcherMixIn, Searcher):
    """Anserini I(n)L2 scoring model. This searcher does not support list parameters."""

    module_name = "INL2"

    config_spec = [
        # array input of this parameter is not supported by anserini.SearchCollection
        ConfigOption("c", 0.1),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run an I(n)L2 search for the queries in a topics file and return ``output_path``."""
        c_value = config["c"]
        hits = config["hits"]
        anserini_param_str = f"-inl2 -inl2.c {c_value} -hits {hits}"

        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)
        return output_path
class SPL(AnseriniSearcherMixIn, Searcher):
    """
    Anserini SPL scoring model. This searcher does not support list parameters.
    """

    module_name = "SPL"

    config_spec = [
        # array input of this parameter is not supported by anserini.SearchCollection
        ConfigOption("c", 0.1),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run an SPL search for the queries in a topics file and return ``output_path``."""
        c_value = config["c"]
        hits = config["hits"]
        anserini_param_str = f"-spl -spl.c {c_value} -hits {hits}"

        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)
        return output_path
class F2Exp(AnseriniSearcherMixIn, Searcher):
    """
    F2Exp scoring model. This searcher does not support list parameters.
    """

    module_name = "F2Exp"

    config_spec = [
        # array input of this parameter is not supported by anserini.SearchCollection
        ConfigOption("s", 0.5),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run an F2Exp search for the queries in a topics file and return ``output_path``."""
        s_value = config["s"]
        hits = config["hits"]
        anserini_param_str = f"-f2exp -f2exp.s {s_value} -hits {hits}"

        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)
        return output_path
class F2Log(AnseriniSearcherMixIn, Searcher):
    """
    F2Log scoring model. This searcher does not support list parameters.
    """

    module_name = "F2Log"

    config_spec = [
        # array input of this parameter is not supported by anserini.SearchCollection
        ConfigOption("s", 0.5),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run an F2Log search for the queries in a topics file and return ``output_path``."""
        s_value = config["s"]
        hits = config["hits"]
        anserini_param_str = f"-f2log -f2log.s {s_value} -hits {hits}"

        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)
        return output_path
class SDM(AnseriniSearcherMixIn, Searcher):
    """
    Anserini BM25 with the Sequential Dependency Model. This searcher supports list parameters for only k1 and b.
    """

    module_name = "SDM"

    # array input of (tw, ow, uw) is not supported by anserini.SearchCollection
    config_spec = [
        ConfigOption("k1", 0.9, "controls term saturation", value_type="floatlist"),
        ConfigOption("b", 0.4, "controls document length normalization", value_type="floatlist"),
        ConfigOption("tw", 0.85, "term weight"),
        ConfigOption("ow", 0.15, "ordered window weight"),
        ConfigOption("uw", 0.05, "unordered window weight"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run BM25 with the Sequential Dependency Model for the queries in a topics file.

        Args:
            topicsfn: path to a topics file.
            output_path: path under which the search results are stored.
            config: mapping providing ``tw``, ``ow``, ``uw``, ``k1``, ``b`` and ``hits``.

        Returns:
            ``output_path``
        """
        hits = config["hits"]

        # "-sdm" selects the model; the term weight needs its own "-sdm.tw"
        # option name just like the ordered/unordered window weights
        anserini_param_str = "-sdm -sdm.tw {0} -sdm.ow {1} -sdm.uw {2}".format(*[config[k] for k in ["tw", "ow", "uw"]])
        anserini_param_str += " -bm25 -bm25.k1 {0} -bm25.b {1}".format(*[list2str(config[k], " ") for k in ["k1", "b"]])
        anserini_param_str += f" -hits {hits}"

        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)
        return output_path