import json
from capreolus import constants
from capreolus.utils.loginit import get_logger
from . import Collection, IRDCollection
# Module-level logger named after this module.
logger = get_logger(__name__)
# Value stored under the "PACKAGE_PATH" key of the capreolus constants mapping.
PACKAGE_PATH = constants["PACKAGE_PATH"]
@Collection.register
class NYT(IRDCollection):
    """New York Times collection. See https://catalog.ldc.upenn.edu/LDC2008T19"""

    # Dataset identifier; presumably the name IRDCollection uses to look the
    # corpus up (TODO confirm against the IRDCollection base class).
    ird_dataset_name = "nyt"
    # Collection format label; presumably tells the indexer that documents are
    # supplied as JSON records (TODO confirm against the consumer of this field).
    collection_type = "JsonCollection"

    def doc_as_json(self, doc):
        """Serialize a single document to a JSON string.

        Args:
            doc: A document object exposing ``doc_id``, ``headline`` and
                ``body`` attributes; ``headline`` and ``body`` must be strings
                because they are joined with ``str.join``.

        Returns:
            A JSON-encoded string of an object with two keys: ``"id"`` (the
            document's ``doc_id``) and ``"contents"`` (the headline and body
            joined by a single space).
        """
        # Headline and body are merged into one text field, separated by a space.
        content = " ".join((doc.headline, doc.body))
        return json.dumps({"id": doc.doc_id, "contents": content})