# Source code for capreolus.benchmark.codesearchnet

import gzip
import pickle
import json
from collections import defaultdict
from pathlib import Path
from zipfile import ZipFile

from tqdm import tqdm

from . import Benchmark
from capreolus import constants, ConfigOption, Dependency
from capreolus.utils.loginit import get_logger
from capreolus.utils.trec import load_qrels, load_trec_topics, topic_to_trectxt
from capreolus.utils.common import download_file, remove_newline

# Module-level logger, named after this module.
logger = get_logger(__name__)
# Package path from the capreolus constants; joined with "/" below to locate the data directories.
PACKAGE_PATH = constants["PACKAGE_PATH"]


@Benchmark.register
class CodeSearchNetCorpus(Benchmark):
    """CodeSearchNet Corpus. [1]

    On first use the per-language archive is downloaded and converted into a topic file,
    a qrel file, a fold file and two JSON id maps (docstring -> qid and url -> docid).

    [1] Hamel Husain, Ho-Hsiang Wu, Tiferet Gazit, Miltiadis Allamanis, and Marc Brockschmidt. 2019. CodeSearchNet Challenge: Evaluating the State of Semantic Code Search. arXiv 2019.
    """

    module_name = "codesearchnet_corpus"
    dependencies = [Dependency(key="collection", module="collection", name="codesearchnet")]
    url = "https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2"
    query_type = "title"

    # root directory of every file generated by this benchmark
    file_fn = PACKAGE_PATH / "data" / "csn_corpus"

    qrel_dir = file_fn / "qrels"
    topic_dir = file_fn / "topics"
    fold_dir = file_fn / "folds"

    qidmap_dir = file_fn / "qidmap"
    docidmap_dir = file_fn / "docidmap"

    config_spec = [ConfigOption("lang", "ruby", "CSN language dataset to use")]

    def build(self):
        """Resolve the per-language file paths, create their parent directories and fetch missing data."""
        lang = self.config["lang"]

        self.qid_map_file = self.qidmap_dir / f"{lang}.json"
        self.docid_map_file = self.docidmap_dir / f"{lang}.json"

        self.qrel_file = self.qrel_dir / f"{lang}.txt"
        self.topic_file = self.topic_dir / f"{lang}.txt"
        self.fold_file = self.fold_dir / f"{lang}.json"

        # make sure the parent directory exists for every instance attribute named "*file"
        for attr_name in [name for name in vars(self) if name.endswith("file")]:
            getattr(self, attr_name).parent.mkdir(exist_ok=True, parents=True)

        self.download_if_missing()

    @property
    def qid_map(self):
        """Mapping from docstring (query text) to qid, loaded lazily from ``qid_map_file``."""
        if not hasattr(self, "_qid_map"):
            if not self.qid_map_file.exists():
                self.download_if_missing()

            with open(self.qid_map_file, "r") as f:
                self._qid_map = json.load(f)
        return self._qid_map

    @property
    def docid_map(self):
        """Mapping from url to docid(s), loaded lazily from ``docid_map_file``.

        See :meth:`_prep_docid_map` for the structure of the values.
        """
        if not hasattr(self, "_docid_map"):
            if not self.docid_map_file.exists():
                self.download_if_missing()

            with open(self.docid_map_file, "r") as f:
                self._docid_map = json.load(f)
        return self._docid_map

    def download_if_missing(self):
        """Download the language archive and generate all benchmark files unless they already exist.

        Writes the topic file, the qrel file, the fold file and both id maps, and also leaves
        ``self._qid_map`` and ``self._docid_map`` populated in memory.
        """
        files = [self.qid_map_file, self.docid_map_file, self.qrel_file, self.topic_file, self.fold_file]
        if all(f.exists() for f in files):
            return

        lang = self.config["lang"]

        tmp_dir = Path("/tmp")
        zip_fn = tmp_dir / f"{lang}.zip"
        if not zip_fn.exists():
            download_file(f"{self.url}/{lang}.zip", zip_fn)

        with ZipFile(zip_fn, "r") as zipobj:
            zipobj.extractall(tmp_dir)

        # prepare docid-url mapping from dedup.pkl
        # NOTE(security): unpickling can execute arbitrary code; this relies on the archive
        # downloaded from ``self.url`` being trustworthy.
        pkl_fn = tmp_dir / f"{lang}_dedupe_definitions_v2.pkl"
        with open(pkl_fn, "rb") as pkl_file:
            doc_objs = pickle.load(pkl_file)
        self._docid_map = self._prep_docid_map(doc_objs)
        # sanity check: every document must have received its own docid
        assert self._get_n_docid() == len(doc_objs)

        # prepare folds, qrels, topics, docstring2qid  # TODO: shall we add negative samples?
        qrels, self._qid_map = defaultdict(dict), {}
        qids = {s: [] for s in ["train", "valid", "test"]}

        def gen_doc_from_gzdir(directory):
            """ generate parsed dict-format doc from all jsonl.gz files under given directory """
            for fn in sorted(directory.glob("*.jsonl.gz")):
                with gzip.open(fn, "rb") as gz_file:
                    for raw_doc in gz_file:
                        yield json.loads(raw_doc)

        # the context manager guarantees both files are closed even if a document fails to parse
        with open(self.topic_file, "w", encoding="utf-8") as topic_file, open(
            self.qrel_file, "w", encoding="utf-8"
        ) as qrel_file:
            for set_name in qids:
                set_path = tmp_dir / lang / "final" / "jsonl" / set_name
                for doc in gen_doc_from_gzdir(set_path):
                    code = remove_newline(" ".join(doc["code_tokens"]))
                    docstring = remove_newline(" ".join(doc["docstring_tokens"]))
                    words = docstring.split()
                    if len(words) >= 1024:
                        logger.warning(
                            f"chunk query to first 1000 words otherwise TooManyClause would be triggered "
                            f"at lucene at search stage, "
                        )
                        docstring = " ".join(words[:1020])  # for TooManyClause

                    docid = self.get_docid(doc["url"], code)
                    # identical docstrings share one qid; a new docstring gets the next running number
                    qid = self._qid_map.get(docstring, str(len(self._qid_map)))
                    qrel_file.write(f"{qid} Q0 {docid} 1\n")

                    if docstring not in self._qid_map:
                        self._qid_map[docstring] = qid
                        qids[set_name].append(qid)
                        topic_file.write(topic_to_trectxt(qid, docstring))

        # write to qid_map.json, docid_map, fold.json
        with open(self.qid_map_file, "w") as f:
            json.dump(self._qid_map, f)
        with open(self.docid_map_file, "w") as f:
            json.dump(self._docid_map, f)
        folds = {"s1": {"train_qids": qids["train"], "predict": {"dev": qids["valid"], "test": qids["test"]}}}
        with open(self.fold_file, "w") as f:
            json.dump(folds, f)

    def _prep_docid_map(self, doc_objs):
        """
        Construct a nested mapping that assigns each doc a unique docid.

        The intermediate structure is ``{url: {" ".join(function_tokens): docid, ...}}``.
        A url with exactly one entry is then compressed to ``{url: [docid]}``; urls with
        several entries keep the second-level ``{code_tokens: docid}`` dict.

        For most language datasets a url uniquely identifies the code tokens, but this is not
        the case for js and php, which therefore need the second-level mapping.

        :param doc_objs: a list of dicts; only the keys "url" and "function_tokens" are read here
        :return: the (defaultdict) url -> docid(s) mapping described above
        """
        # TODO: any way to avoid the twice traversal of all url and make the return dict structure consistent
        lang = self.config["lang"]
        url2docid = defaultdict(dict)
        for i, doc in tqdm(enumerate(doc_objs), desc=f"Preparing the {lang} docid_map"):
            url, code_tokens = doc["url"], remove_newline(" ".join(doc["function_tokens"]))
            url2docid[url][code_tokens] = f"{lang}-FUNCTION-{i}"

        # remove the code_tokens for the unique url-docid mapping
        # (only existing keys are reassigned, so the dict size does not change while iterating)
        for url, docids in tqdm(url2docid.items(), desc=f"Compressing the {lang} docid_map"):
            url2docid[url] = list(docids.values()) if len(docids) == 1 else docids  # {code_tokens: docid} -> [docid]
        return url2docid

    def _get_n_docid(self):
        """ calculate the number of document ids contained in the nested docid map """
        return sum(len(docs) for docs in self._docid_map.values())

    def get_docid(self, url, code_tokens):
        """Retrieve the docid of the document identified by ``url`` and ``code_tokens``.

        :param url: the document url, a first-level key of ``docid_map``
        :param code_tokens: the space-joined code tokens; only consulted when ``url`` maps to several docids
        :return: the docid string
        """
        docids = self.docid_map[url]
        # a single entry is stored as [docid]; several entries are stored as {code_tokens: docid}
        return docids[0] if len(docids) == 1 else docids[code_tokens]


@Benchmark.register
class CodeSearchNetChallenge(Benchmark):
    """CodeSearchNet Challenge. [1]
    This benchmark can only be used for training (and challenge submissions) because no qrels are provided.

    [1] Hamel Husain, Ho-Hsiang Wu, Tiferet Gazit, Miltiadis Allamanis, and Marc Brockschmidt. 2019. CodeSearchNet Challenge: Evaluating the State of Semantic Code Search. arXiv 2019.
    """

    module_name = "codesearchnet_challenge"
    dependencies = [Dependency(key="collection", module="collection", name="codesearchnet")]
    config_spec = [ConfigOption("lang", "ruby", "CSN language dataset to use")]

    url = "https://raw.githubusercontent.com/github/CodeSearchNet/master/resources/queries.csv"
    query_type = "title"

    # generated files live under the package data directory
    file_fn = PACKAGE_PATH / "data" / "csn_challenge"
    topic_file = file_fn / "topics.txt"
    qid_map_file = file_fn / "qidmap.json"

    def download_if_missing(self):
        """Download the challenge queries and write the topic file and the qid -> query map.

        Nothing is done when both output files already exist.
        """
        if self.topic_file.exists() and self.qid_map_file.exists():
            return

        tmp_dir = Path("/tmp")
        tmp_dir.mkdir(exist_ok=True, parents=True)
        self.file_fn.mkdir(exist_ok=True, parents=True)

        query_fn = tmp_dir / "query.csv"
        if not query_fn.exists():
            download_file(self.url, query_fn)

        # prepare qid - query; the qid is the line number within the downloaded file
        qid_map = {}
        with open(self.topic_file, "w", encoding="utf-8") as topic_file, open(query_fn) as query_file:
            for qid, line in enumerate(query_file):
                if qid == 0:  # ignore the first line "query"
                    continue
                topic_file.write(topic_to_trectxt(qid, line.strip()))
                # NOTE: the raw line (including its line ending) is stored, unlike the stripped
                # topic text; kept as-is so previously generated qidmap files stay comparable.
                qid_map[qid] = line

        # json serializes the integer qids as string keys
        with open(self.qid_map_file, "w") as f:
            json.dump(qid_map, f)