# Source code for capreolus.searcher.anserini

import math
import os
import subprocess

import numpy as np

from capreolus import ConfigOption, Dependency, constants
from capreolus.utils.common import Anserini
from capreolus.utils.loginit import get_logger

from . import Searcher

# Module-level logger shared by every searcher defined in this file.
logger = get_logger(__name__)  # pylint: disable=invalid-name
# Value passed to SearchCollection's ``-threads`` option.
MAX_THREADS = constants["MAX_THREADS"]


def list2str(l, delimiter="-"):
    """Join the string form of every item in ``l`` with ``delimiter``.

    Args:
        l: iterable of values; each one is converted with ``str``
        delimiter: text placed between consecutive items (default ``"-"``)

    Returns:
        A single string; empty when ``l`` has no items.
    """
    return delimiter.join(map(str, l))


class AnseriniSearcherMixIn:
    """MixIn for searchers that use Anserini's SearchCollection script"""

    # Searchers using this mixin declare an Anserini index dependency
    # (presumably what is exposed as ``self.index`` below -- confirm against Dependency).
    dependencies = [Dependency(key="index", module="index", name="anserini")]

    def _anserini_query_from_file(self, topicsfn, anserini_param_str, output_base_path):
        """Run Anserini's SearchCollection on a topics file unless it was already run.

        A ``done`` marker file inside ``output_base_path`` records a successful run;
        when it exists the call returns without doing anything.

        Args:
            topicsfn: path to the topics file (read with the ``TsvString`` topic reader)
            anserini_param_str: extra SearchCollection arguments, split on whitespace
            output_base_path: directory receiving the ``searcher`` output and the ``done`` marker

        Raises:
            IOError: if ``topicsfn`` does not exist
            RuntimeError: if the java process exits with a non-zero status
        """
        if not os.path.exists(topicsfn):
            raise IOError(f"could not find topics file: {topicsfn}")

        donefn = os.path.join(output_base_path, "done")
        if os.path.exists(donefn):
            logger.debug(f"skipping Anserini SearchCollection call because path already exists: {donefn}")
            return

        # create index if it does not exist. the call returns immediately if the index does exist.
        self.index.create_index()

        os.makedirs(output_base_path, exist_ok=True)
        output_path = os.path.join(output_base_path, "searcher")

        index_path = self.index.get_index_path()
        anserini_fat_jar = Anserini.get_fat_jar()
        stemmer = self.index.config["stemmer"]
        cmd = [
            "java",
            "-classpath",
            anserini_fat_jar,
            "-Xms512M",
            "-Xmx31G",
            "-Dapp.name=SearchCollection",
            "io.anserini.search.SearchCollection",
            "-topicreader",
            "TsvString",
            "-index",
            index_path,
            "-topics",
            topicsfn,
            "-output",
            output_path,
            "-inmem",
            "-threads",
            str(MAX_THREADS),
            "-stemmer",
            "none" if stemmer is None else stemmer,
        ] + anserini_param_str.split()

        if self.index.config["indexstops"]:
            cmd += ["-keepStopwords"]

        logger.info("Anserini writing runs to %s", output_path)
        logger.debug(cmd)

        # The context manager closes the stdout pipe and waits for the process on exit,
        # even if handling a line raises.
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True) as app:
            # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
            for line in app.stdout:
                Anserini.filter_and_log_anserini_output(line, logger)

        if app.returncode != 0:
            cmd_str = " ".join(str(arg) for arg in cmd)
            raise RuntimeError(f"command failed with exit code {app.returncode}: {cmd_str}")

        # only mark the run as complete after SearchCollection succeeded
        with open(donefn, "wt") as donef:
            print("done", file=donef)


class PostprocessMixin:
    """MixIn with helpers that rewrite the TREC run files found in a run directory."""

    def _keep_topn(self, runs, topn):
        """Truncate each query's results to its ``topn`` highest-scoring documents.

        Args:
            runs: dict mapping query id -> {doc id: score}; modified in place
            topn: maximum number of documents to keep per query

        Returns:
            The same ``runs`` dict.
        """
        # Queries are independent, so no ordering of the query ids is needed
        # (and ids therefore do not have to be numeric).
        for qid, docs in list(runs.items()):
            if len(docs) <= topn:
                continue
            ranked = sorted(docs.items(), key=lambda kv: kv[1], reverse=True)
            runs[qid] = dict(ranked[:topn])
        return runs

    def filter(self, run_dir, docs_to_remove=None, docs_to_keep=None, topn=None):
        """Filter every run file in ``run_dir`` in place.

        Args:
            run_dir: directory of run files; an entry named ``done`` is skipped
            docs_to_remove: list of doc ids, or dict of query id -> doc ids, to drop
            docs_to_keep: list of doc ids, or dict of query id -> doc ids, to retain;
                only used when ``docs_to_remove`` is empty
            topn: if truthy, keep at most this many documents per query after filtering

        Returns:
            ``run_dir``

        Raises:
            ValueError: if neither ``docs_to_remove`` nor ``docs_to_keep`` is given
        """
        if (not docs_to_keep) and (not docs_to_remove):
            raise ValueError("filter() requires a non-empty docs_to_remove or docs_to_keep")

        for fn in os.listdir(run_dir):
            if fn == "done":
                continue

            run_fn = os.path.join(run_dir, fn)
            self._filter(run_fn, docs_to_remove, docs_to_keep, topn)
        return run_dir

    def _filter(self, runfile, docs_to_remove, docs_to_keep, topn):
        """Apply the remove/keep filter (and optional top-n cut) to one run file, overwriting it."""
        runs = Searcher.load_trec_run(runfile)

        # filtering
        if docs_to_remove:  # prioritize docs_to_remove
            if isinstance(docs_to_remove, list):
                # a plain list applies to every query
                docs_to_remove = {q: docs_to_remove for q in runs}
            runs = {q: {d: v for d, v in docs.items() if d not in docs_to_remove.get(q, [])} for q, docs in runs.items()}
        elif docs_to_keep:
            if isinstance(docs_to_keep, list):
                docs_to_keep = {q: docs_to_keep for q in runs}
            runs = {q: {d: v for d, v in docs.items() if d in docs_to_keep[q]} for q, docs in runs.items()}

        if topn:
            runs = self._keep_topn(runs, topn)
        Searcher.write_trec_run(runs, runfile)  # overwrite runfile

    def dedup(self, run_dir, topn=None):
        """Collapse passage-level results to document level in every run file of ``run_dir``.

        Args:
            run_dir: directory of run files; an entry named ``done`` is skipped
            topn: if truthy, keep at most this many documents per query

        Returns:
            ``run_dir``
        """
        for fn in os.listdir(run_dir):
            if fn == "done":
                continue
            run_fn = os.path.join(run_dir, fn)
            self._dedup(run_fn, topn)
        return run_dir

    def _dedup(self, runfile, topn):
        """Rewrite one run file so each document id appears once per query."""
        runs = Searcher.load_trec_run(runfile)
        new_runs = {q: {} for q in runs}

        # use the maximum passage score as the document score, no sorting is done here
        for q, psg in runs.items():
            for pid, score in psg.items():
                # the document id is the part of the passage id before the first "."
                docid = pid.split(".")[0]
                new_runs[q][docid] = max(new_runs[q].get(docid, -math.inf), score)
        runs = new_runs

        if topn:
            runs = self._keep_topn(runs, topn)
        Searcher.write_trec_run(runs, runfile)


@Searcher.register
class BM25(AnseriniSearcherMixIn, Searcher):
    """Anserini BM25. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``)."""

    module_name = "BM25"
    config_spec = [
        ConfigOption("k1", 0.9, "controls term saturation", value_type="floatlist"),
        ConfigOption("b", 0.4, "controls document length normalization", value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run a BM25 search for the queries in a topics file.

        Args:
            topicsfn: Path to a topics file
            output_path: Directory where the search output should be stored
            config: mapping providing the ``b``, ``k1`` and ``hits`` settings

        Returns: ``output_path``, unchanged
        """
        b_values = list2str(config["b"], delimiter=" ")
        k1_values = list2str(config["k1"], delimiter=" ")
        # every value of b and k1 is handed to Anserini in a single call
        flags = ["-bm25", "-bm25.b", b_values, "-bm25.k1", k1_values, "-hits", str(config["hits"])]
        self._anserini_query_from_file(topicsfn, " ".join(flags), output_path)

        return output_path


@Searcher.register
class BM25Grid(AnseriniSearcherMixIn, Searcher):
    """Deprecated. BM25 with a grid search for k1 and b. Search is from 0.1 to bmax/k1max in 0.1 increments"""

    module_name = "BM25Grid"
    config_spec = [
        ConfigOption("k1max", 1.0, "maximum k1 value to include in grid search (starting at 0.1)"),
        ConfigOption("bmax", 1.0, "maximum b value to include in grid search (starting at 0.1)"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    @staticmethod
    def _grid_values(max_value, step=0.1):
        """Return the multiples of ``step`` (rounded to one decimal) that cover ``step..max_value``.

        The number of steps is computed as an integer instead of calling ``np.arange``
        with a fractional step, because float rounding can make the latter emit one
        extra value (e.g. a maximum of 0.3 produced 0.1, 0.2, 0.3 *and* 0.4).
        """
        # the small epsilon absorbs division error such as 0.3 / 0.1 == 2.9999999999999996
        n_steps = math.ceil(max_value / step - 1e-9)
        return np.around(np.arange(1, n_steps + 1) * step, 1)

    def _query_from_file(self, topicsfn, output_path, config):
        """Run BM25 over the full (b, k1) grid in a single Anserini call.

        Args:
            topicsfn: Path to a topics file
            output_path: Directory where the search output should be stored
            config: mapping providing the ``bmax``, ``k1max`` and ``hits`` settings

        Returns: ``output_path``, unchanged
        """
        bs = self._grid_values(config["bmax"])
        k1s = self._grid_values(config["k1max"])
        bstr = " ".join(str(x) for x in bs)
        k1str = " ".join(str(x) for x in k1s)
        hits = config["hits"]
        anserini_param_str = f"-bm25 -bm25.b {bstr} -bm25.k1 {k1str} -hits {hits}"

        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path


@Searcher.register
class BM25RM3(AnseriniSearcherMixIn, Searcher):
    """Anserini BM25 with RM3 expansion. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``)."""

    module_name = "BM25RM3"
    config_spec = [
        ConfigOption("k1", "0.9", "controls term saturation", value_type="floatlist"),
        ConfigOption("b", "0.4", "controls document length normalization", value_type="floatlist"),
        ConfigOption("fbTerms", [5, 25], "number of generated terms from feedback", value_type="intlist"),
        ConfigOption("fbDocs", [5, 10], "number of documents used for feedback", value_type="intlist"),
        ConfigOption("originalQueryWeight", [0.5], "the weight of unexpanded query", value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run BM25 with RM3 expansion for the queries in a topics file.

        Args:
            topicsfn: Path to a topics file
            output_path: Directory where the search output should be stored
            config: mapping providing the RM3 and BM25 settings listed in ``config_spec``

        Returns: ``output_path``, unchanged
        """
        hits = str(config["hits"])

        # each option becomes "-<prefix>.<name> v1 v2 ..." so Anserini sees every listed value
        anserini_param_str = (
            "-rm3 "
            + " ".join(f"-rm3.{k} {list2str(config[k], ' ')}" for k in ["fbTerms", "fbDocs", "originalQueryWeight"])
            + " -bm25 "
            + " ".join(f"-bm25.{k} {list2str(config[k], ' ')}" for k in ["k1", "b"])
            + f" -hits {hits}"
        )
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path


@Searcher.register
class BM25PostProcess(BM25, PostprocessMixin):
    """BM25 searcher whose run files can afterwards be filtered and/or de-duplicated."""

    module_name = "BM25Postprocess"

    config_spec = [
        ConfigOption("k1", 0.9, "controls term saturation", value_type="floatlist"),
        ConfigOption("b", 0.4, "controls document length normalization", value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results expected from the core searcher"),
        ConfigOption("topn", 1000, "number of results expected after the filtering (if any)"),
        ConfigOption("dedup", False),
    ]

    def query_from_file(self, topicsfn, output_path, docs_to_remove=None):
        """Search, then optionally post-process the resulting run files.

        Args:
            topicsfn: Path to a topics file
            output_path: Directory where the search output should be stored
            docs_to_remove: doc ids to drop from the runs; no filtering when falsy

        Returns: the run directory after the post-processing steps that were applied
        """
        # the parent implementation is expected to dispatch to BM25._query_from_file()
        run_dir = super().query_from_file(topicsfn, output_path)

        if docs_to_remove:
            run_dir = self.filter(run_dir, docs_to_remove=docs_to_remove, topn=self.config["topn"])

        if self.config["dedup"]:
            run_dir = self.dedup(run_dir, topn=self.config["topn"])

        return run_dir


class StaticRun(Searcher):
    """Base class for searchers that serve a run file shipped in the package's ``data`` directory.

    Subclasses set ``run_fn`` to the name of that file.
    """

    def _query_from_file(self, topicsfn, output_path, config):
        """Copy the packaged run file to ``output_path/static.run`` if it is not there yet.

        ``topicsfn`` and ``config`` are not used.

        Returns: ``output_path``, unchanged
        """
        import shutil

        target = os.path.join(output_path, "static.run")
        if os.path.exists(target):
            return output_path

        os.makedirs(output_path, exist_ok=True)
        source = constants["PACKAGE_PATH"] / "data" / self.run_fn
        shutil.copy2(source, target)
        return output_path

    def query(self, *args, **kwargs):
        """Always raises: a static run cannot answer new queries."""
        raise NotImplementedError("this searcher uses a static run file, so it cannot handle new queries")


@Searcher.register
class StaticBM25RM3Rob04Yang19(StaticRun):
    """Tuned BM25+RM3 run used by Yang et al. in [1], served from ``rob04_yang19_rm3.run``.

    Use it only with a benchmark that has the same folds and queries.

    [1] Wei Yang, Kuang Lu, Peilin Yang, and Jimmy Lin. Critically Examining the "Neural Hype": Weak Baselines and  the Additivity of Effectiveness Gains from Neural Ranking Models. SIGIR 2019.
    """

    module_name = "bm25staticrob04yang19"
    run_fn = "rob04_yang19_rm3.run"


@Searcher.register
class StaticBM25RM3Rob04Yang19Desc(StaticRun):
    """Tuned BM25+RM3 robust04 description run on the folds used by Yang et al. in [1], served from ``rob04_yang19_desc_rm3.run``.

    Use it only with a benchmark that has the same folds and queries.

    [1] Wei Yang, Kuang Lu, Peilin Yang, and Jimmy Lin. Critically Examining the "Neural Hype": Weak Baselines and  the Additivity of Effectiveness Gains from Neural Ranking Models. SIGIR 2019.
    """

    module_name = "bm25staticrob04yang19desc"
    run_fn = "rob04_yang19_desc_rm3.run"


@Searcher.register
class StaticBM25Rob04Huston14Title(StaticRun):
    """Static searcher that serves the packaged run file ``rob04_huston14_title_rm3.run``."""

    module_name = "bm25staticrob04huston14title"
    run_fn = "rob04_huston14_title_rm3.run"


@Searcher.register
class StaticBM25Rob04Huston14Desc(StaticRun):
    """Static searcher that serves the packaged run file ``rob04_huston14_desc_rm3.run``."""

    module_name = "bm25staticrob04huston14desc"
    run_fn = "rob04_huston14_desc_rm3.run"


@Searcher.register
class StaticBM25Gov2(StaticRun):
    """Static searcher that serves the packaged run file ``gov2_bm25.run``."""

    module_name = "bm25staticgov2"
    run_fn = "gov2_bm25.run"


@Searcher.register
class StaticBM25Gov2Desc(StaticRun):
    """Static searcher that serves the packaged run file ``gov2_desc_bm25.run``."""

    module_name = "bm25staticgov2desc"
    run_fn = "gov2_desc_bm25.run"


@Searcher.register
class StaticBM25Genomics(StaticRun):
    """Static searcher that serves the packaged run file ``genomics_bm25.run``."""

    module_name = "bm25staticgenomics"
    run_fn = "genomics_bm25.run"


@Searcher.register
class StaticBM25CDS(StaticRun):
    """CDS BM25 run (k1=4.0, b=0.6) with the new CDS 2016 documents removed from the 2014 and 2015 queries.

    Served from the packaged run file ``cds_bm25.run``.
    """

    module_name = "bm25staticcds"
    run_fn = "cds_bm25.run"


@Searcher.register
class StaticCovidUdelAbstract(StaticRun):
    """Static searcher that serves the packaged run file ``anserini.covid-r5.abstract.qdel.bm25-top1k.txt``."""

    module_name = "qdelstaticcovidabstract"
    run_fn = "anserini.covid-r5.abstract.qdel.bm25-top1k.txt"


@Searcher.register
class StaticRM3TitleCore18(StaticRun):
    """Static searcher that serves the packaged run file ``core18_title_rm3.run``."""

    module_name = "rm3staticcore18title"
    run_fn = "core18_title_rm3.run"


@Searcher.register
class StaticRM3DescCore18(StaticRun):
    """Static searcher that serves the packaged run file ``core18_desc_rm3.run``."""

    module_name = "rm3staticcore18desc"
    run_fn = "core18_desc_rm3.run"


@Searcher.register
class BM25PRF(AnseriniSearcherMixIn, Searcher):
    """Anserini BM25 PRF. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``)."""

    module_name = "BM25PRF"
    config_spec = [
        ConfigOption("k1", [0.65, 0.70, 0.75], "controls term saturation", value_type="floatlist"),
        ConfigOption("b", [0.60, 0.7], "controls document length normalization", value_type="floatlist"),
        ConfigOption("fbTerms", [65, 70, 95, 100], "number of generated terms from feedback", value_type="intlist"),
        ConfigOption("fbDocs", [5, 10, 15], "number of documents used for feedback", value_type="intlist"),
        ConfigOption("newTermWeight", [0.2, 0.25], value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run BM25 with BM25PRF feedback for the queries in a topics file.

        Args:
            topicsfn: Path to a topics file
            output_path: Directory where the search output should be stored
            config: mapping providing the settings listed in ``config_spec``

        Returns: ``output_path``, unchanged
        """
        hits = str(config["hits"])

        # k1 and b are passed twice: once for the feedback model and once for the base BM25 scorer
        anserini_param_str = (
            "-bm25prf "
            + " ".join(f"-bm25prf.{k} {list2str(config[k], ' ')}" for k in ["fbTerms", "fbDocs", "newTermWeight", "k1", "b"])
            + " -bm25 "
            + " ".join(f"-bm25.{k} {list2str(config[k], ' ')}" for k in ["k1", "b"])
            + f" -hits {hits}"
        )
        # report the destination through the module logger rather than printing to stdout
        logger.debug("BM25PRF output path: %s", output_path)
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path


@Searcher.register
class AxiomaticSemanticMatching(AnseriniSearcherMixIn, Searcher):
    """Anserini BM25 with Axiomatic query expansion. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``)."""

    module_name = "axiomatic"
    config_spec = [
        ConfigOption("k1", 0.9, "controls term saturation", value_type="floatlist"),
        ConfigOption("b", 0.4, "controls document length normalization", value_type="floatlist"),
        ConfigOption("r", 20, value_type="intlist"),
        ConfigOption("n", 30, value_type="intlist"),
        ConfigOption("beta", 0.4, value_type="floatlist"),
        ConfigOption("top", 20, value_type="intlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run BM25 with axiomatic expansion for the queries in a topics file.

        Args:
            topicsfn: Path to a topics file
            output_path: Directory where the search output should be stored
            config: mapping providing the settings listed in ``config_spec``

        Returns: ``output_path``, unchanged
        """
        hits = str(config["hits"])

        axiom_values = [list2str(config[name], " ") for name in ["r", "n", "beta", "top"]]
        axiom_part = "-axiom -axiom.deterministic -axiom.r {0} -axiom.n {1} -axiom.beta {2} -axiom.top {3}".format(
            *axiom_values
        )

        bm25_values = [list2str(config[name], " ") for name in ["k1", "b"]]
        bm25_part = " -bm25 -bm25.k1 {0} -bm25.b {1} ".format(*bm25_values)

        anserini_param_str = axiom_part + bm25_part + f" -hits {hits}"
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path


@Searcher.register
class DirichletQL(AnseriniSearcherMixIn, Searcher):
    """Anserini QL with Dirichlet smoothing. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``)."""

    module_name = "DirichletQL"
    config_spec = [
        ConfigOption("mu", 1000, "smoothing parameter", value_type="intlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run a Dirichlet QL search for the queries in a topics file.

        Args:
            topicsfn: Path to a topics file
            output_path: Directory where the search output should be stored
            config: mapping providing the ``mu`` and ``hits`` settings

        Returns: ``output_path``, unchanged
        """
        mu_values = list2str(config["mu"], delimiter=" ")
        flags = ["-qld", "-qld.mu", mu_values, "-hits", str(config["hits"])]
        self._anserini_query_from_file(topicsfn, " ".join(flags), output_path)

        return output_path


@Searcher.register
class QLJM(AnseriniSearcherMixIn, Searcher):
    """Anserini QL with Jelinek-Mercer smoothing. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``)."""

    module_name = "QLJM"
    config_spec = [
        ConfigOption("lam", 0.1, value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run a Jelinek-Mercer QL search for the queries in a topics file.

        Args:
            topicsfn: Path to a topics file
            output_path: Directory where the search output should be stored
            config: mapping providing the ``lam`` and ``hits`` settings

        Returns: ``output_path``, unchanged
        """
        lambda_values = list2str(config["lam"], delimiter=" ")
        hits = config["hits"]
        anserini_param_str = f"-qljm -qljm.lambda {lambda_values} -hits {hits}"

        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path


@Searcher.register
class INL2(AnseriniSearcherMixIn, Searcher):
    """Anserini I(n)L2 scoring model. This searcher does not support list parameters."""

    module_name = "INL2"
    config_spec = [
        ConfigOption("c", 0.1),  # array input of this parameter is not supported by anserini.SearchCollection
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run an I(n)L2 search for the queries in a topics file.

        Args:
            topicsfn: Path to a topics file
            output_path: Directory where the search output should be stored
            config: mapping providing the ``c`` and ``hits`` settings

        Returns: ``output_path``, unchanged
        """
        c_value = config["c"]
        hits = config["hits"]
        self._anserini_query_from_file(topicsfn, f"-inl2 -inl2.c {c_value} -hits {hits}", output_path)
        return output_path


@Searcher.register
class SPL(AnseriniSearcherMixIn, Searcher):
    """
    Anserini SPL scoring model. This searcher does not support list parameters.
    """

    module_name = "SPL"
    config_spec = [
        ConfigOption("c", 0.1),  # array input of this parameter is not supported by anserini.SearchCollection
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run an SPL search for the queries in a topics file.

        Args:
            topicsfn: Path to a topics file
            output_path: Directory where the search output should be stored
            config: mapping providing the ``c`` and ``hits`` settings

        Returns: ``output_path``, unchanged
        """
        c_value = config["c"]
        hits = config["hits"]
        self._anserini_query_from_file(topicsfn, f"-spl -spl.c {c_value} -hits {hits}", output_path)

        return output_path


@Searcher.register
class F2Exp(AnseriniSearcherMixIn, Searcher):
    """
    F2Exp scoring model. This searcher does not support list parameters.
    """

    module_name = "F2Exp"
    config_spec = [
        ConfigOption("s", 0.5),  # array input of this parameter is not supported by anserini.SearchCollection
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run an F2Exp search for the queries in a topics file.

        Args:
            topicsfn: Path to a topics file
            output_path: Directory where the search output should be stored
            config: mapping providing the ``s`` and ``hits`` settings

        Returns: ``output_path``, unchanged
        """
        s_value = config["s"]
        hits = config["hits"]
        self._anserini_query_from_file(topicsfn, f"-f2exp -f2exp.s {s_value} -hits {hits}", output_path)

        return output_path


@Searcher.register
class F2Log(AnseriniSearcherMixIn, Searcher):
    """
    F2Log scoring model. This searcher does not support list parameters.
    """

    module_name = "F2Log"
    config_spec = [
        ConfigOption("s", 0.5),  # array input of this parameter is not supported by anserini.SearchCollection
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run an F2Log search for the queries in a topics file.

        Args:
            topicsfn: Path to a topics file
            output_path: Directory where the search output should be stored
            config: mapping providing the ``s`` and ``hits`` settings

        Returns: ``output_path``, unchanged
        """
        s_value = config["s"]
        hits = config["hits"]
        self._anserini_query_from_file(topicsfn, f"-f2log -f2log.s {s_value} -hits {hits}", output_path)

        return output_path


@Searcher.register
class SDM(AnseriniSearcherMixIn, Searcher):
    """
    Anserini BM25 with the Sequential Dependency Model. This searcher supports list parameters for only k1 and b.
    """

    module_name = "SDM"
    # array input of (tw, ow, uw) is not supported by anserini.SearchCollection
    config_spec = [
        ConfigOption("k1", 0.9, "controls term saturation", value_type="floatlist"),
        ConfigOption("b", 0.4, "controls document length normalization", value_type="floatlist"),
        ConfigOption("tw", 0.85, "term weight"),
        ConfigOption("ow", 0.15, "ordered window weight"),
        ConfigOption("uw", 0.05, "unordered window weight"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """Run BM25 with SDM for the queries in a topics file.

        Args:
            topicsfn: Path to a topics file
            output_path: Directory where the search output should be stored
            config: mapping providing the settings listed in ``config_spec``

        Returns: ``output_path``, unchanged
        """
        hits = config["hits"]

        # the three SDM weights are single values; only k1 and b may hold several values
        tw, ow, uw = (config[name] for name in ["tw", "ow", "uw"])
        sdm_part = f"-sdm -sdm.tw {tw} -sdm.ow {ow} -sdm.uw {uw}"

        k1_values, b_values = (list2str(config[name], " ") for name in ["k1", "b"])
        bm25_part = f" -bm25 -bm25.k1 {k1_values} -bm25.b {b_values}"

        anserini_param_str = sdm_part + bm25_part + f" -hits {hits}"
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path