Source code for capreolus.tokenizer.bert

from transformers import BertTokenizer as HFBertTokenizer

from capreolus import ConfigOption

from . import Tokenizer


@Tokenizer.register
class BertTokenizer(Tokenizer):
    """Tokenizer module that delegates to the HuggingFace ``BertTokenizer``.

    The underlying tokenizer is created in :meth:`build` from the vocabulary
    named by the ``pretrained`` config option.
    """

    module_name = "berttokenizer"
    config_spec = [ConfigOption("pretrained", "bert-base-uncased", "pretrained model to load vocab from")]

    def build(self):
        """Instantiate the wrapped tokenizer from ``self.config["pretrained"]``."""
        self.bert_tokenizer = HFBertTokenizer.from_pretrained(self.config["pretrained"])

    def convert_tokens_to_ids(self, tokens):
        """Return the result of the wrapped tokenizer's ``convert_tokens_to_ids``.

        Args:
            tokens: passed through unchanged to the wrapped tokenizer.
        """
        return self.bert_tokenizer.convert_tokens_to_ids(tokens)

    def tokenize(self, sentences):
        """Tokenize a single string or an iterable of strings.

        Args:
            sentences: either one ``str`` or an iterable whose items are each
                passed to the wrapped tokenizer's ``tokenize``.

        Returns:
            ``[]`` when ``sentences`` is falsy (e.g. ``""``, ``[]`` or ``None``);
            the wrapped tokenizer's output when ``sentences`` is a ``str``;
            otherwise a list with one tokenized result per item, in order.
        """
        # Truthiness already covers "" and [] (and None), so no separate
        # length check is needed.
        if not sentences:
            return []

        # A lone string is tokenized directly rather than character by character.
        if isinstance(sentences, str):
            return self.bert_tokenizer.tokenize(sentences)

        return [self.bert_tokenizer.tokenize(s) for s in sentences]