Source code for dacy.subclasses.classification_transformer
"""
Functions for wrapping a sequence classification transformer in a SpaCy pipeline
"""
from typing import List, Callable, Iterable, Dict, Optional, Union
from pathlib import Path
import warnings
from spacy.language import Language
from spacy import util
from spacy.pipeline.pipe import deserialize_config
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy_transformers import Transformer
from spacy_transformers.layers.transformer_model import forward, set_pytorch_transformer
from spacy_transformers.data_classes import (
FullTransformerBatch,
WordpieceBatch,
)
from spacy_transformers.annotation_setters import null_annotation_setter
from spacy_transformers.util import registry, huggingface_tokenize
from thinc.api import (
get_current_ops,
CupyOps,
Model,
Config,
)
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from ..utils import softmax
DEFAULT_CONFIG_STR = """
[classification_transformer]
max_batch_items = 4096
doc_extension_attribute = "clf_trf_data"
[classification_transformer.set_extra_annotations]
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
[classification_transformer.model]
@architectures = "dacy.ClassificationTransformerModel.v1"
name = "roberta-base"
tokenizer_config = {"use_fast": true}
num_labels = 2
[classification_transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96
"""
DEFAULT_CONFIG = Config().from_str(DEFAULT_CONFIG_STR)
[docs]@Language.factory(
"classification_transformer",
default_config=DEFAULT_CONFIG["classification_transformer"],
)
def make_classification_transformer(
nlp: Language,
name: str,
model: Model[List[Doc], FullTransformerBatch],
set_extra_annotations: Callable[[List[Doc], FullTransformerBatch], None],
max_batch_items: int,
doc_extension_attribute: str,
):
"""
Construct a Transformer component, which lets you plug a model from the
Huggingface transformers library into spaCy so you can use it in your
pipeline. One or more subsequent spaCy components can use the transformer
outputs as features in its model, with gradients backpropagated to the single
shared weights.
Args:
nlp (Language): a SpaCy text processing pipeline
name (str): The desired name of the component
model (Model[List[Doc], FullTransformerBatch]):
A thinc Model object wrapping the transformer. Usually you will want to use the TransformerModel layer for this.
set_extra_annotations (Callable[[List[Doc], FullTransformerBatch], None]):
A callback to set additional information onto the batch of `Doc` objects.
The doc._.clf_trf_data attribute is set prior to calling the callback. By default, no additional annotations are set.
max_batch_items (int): Max batch size
doc_extension_attribute (str): Your desired doc extension
Returns:
Your ClassificationTransformer component
"""
return ClassificationTransformer(
nlp.vocab,
model,
set_extra_annotations,
max_batch_items=max_batch_items,
name=name,
doc_extension_attribute=doc_extension_attribute,
)
[docs]@registry.architectures.register("dacy.ClassificationTransformerModel.v1")
def ClassificationTransformerModel(
name: str, get_spans: Callable, tokenizer_config: dict, num_labels: int
) -> Model[List[Doc], FullTransformerBatch]:
"""
Args:
get_spans (Callable[[List[Doc]], List[Span]]):
A function to extract spans from the batch of Doc objects.
This is used to manage long documents, by cutting them into smaller
sequences before running the transformer. The spans are allowed to
overlap, and you can also omit sections of the Doc if they are not
relevant.
tokenizer_config (dict): Settings to pass to the transformers tokenizer.
"""
return Model(
"classification_transformer",
forward,
init=init,
layers=[],
dims={"nO": None},
attrs={
"tokenizer": None,
"get_spans": get_spans,
"name": name,
"tokenizer_config": tokenizer_config,
"num_labels": num_labels,
"set_transformer": set_pytorch_transformer,
"has_transformer": False,
"flush_cache_chance": 0.0,
},
)
[docs]class ClassificationTransformer(Transformer):
""""""
def __init__(
self,
vocab: Vocab,
model: Model[List[Doc], FullTransformerBatch],
set_extra_annotations: Callable = null_annotation_setter,
*,
name: str = "classification_transformer",
max_batch_items: int = 128 * 32, # Max size of padded batch
doc_extension_attribute,
):
super().__init__(
vocab=vocab,
model=model,
set_extra_annotations=set_extra_annotations,
name=name,
max_batch_items=max_batch_items,
)
install_extensions(doc_extension_attribute)
self.doc_extension_attribute = doc_extension_attribute
[docs] def from_disk(
self,
path: Union[str, Path],
*,
num_labels: int,
exclude: Iterable[str] = tuple(),
) -> "Transformer":
"""Load the pipe from disk. For more see:
https://spacy.io/api/transformer#from_disk
Args:
path (str): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.
num_labels (int): Number of labels of the models. Required for reading the model into memory.
Return:
(Transformer): The loaded object.
"""
def load_model(p):
p = Path(p).absolute()
tokenizer, transformer = huggingface_classification_from_pretrained(
p, self.model.attrs["tokenizer_config"], num_labels=num_labels
)
self.model.attrs["tokenizer"] = tokenizer
self.model.attrs["set_transformer"](self.model, transformer)
deserialize = {
"vocab": self.vocab.from_disk,
"cfg": lambda p: self.cfg.update(deserialize_config(p)),
"model": load_model,
}
util.from_disk(path, deserialize, exclude)
return self
[docs] def set_annotations(
self, docs: Iterable[Doc], predictions: FullTransformerBatch
) -> None:
"""
Assign the extracted features to the Doc objects. By default, the
TransformerData object is written to the doc._.trf_data attribute. Your
set_extra_annotations callback is then called, if provided. For more see
https://spacy.io/api/pipe#set_annotations
Args:
docs (Iterable[Doc]): The documents to modify.
predictions (FullTransformerBatch): A batch of activations.
"""
doc_data = list(predictions.doc_data)
for doc, data in zip(docs, doc_data):
setattr(doc._, self.doc_extension_attribute, data)
self.set_extra_annotations(docs, predictions)
[docs]def init(model: Model, X=None, Y=None):
if model.attrs["has_transformer"]:
return
name = model.attrs["name"]
tok_cfg = model.attrs["tokenizer_config"]
num_labels = model.attrs["num_labels"]
tokenizer, transformer = huggingface_classification_from_pretrained(
name, tok_cfg, num_labels
)
model.attrs["tokenizer"] = tokenizer
model.attrs["set_transformer"](model, transformer)
# Call the model with a batch of inputs to infer the width
texts = ["hello world", "foo bar"]
token_data = huggingface_tokenize(model.attrs["tokenizer"], texts)
wordpieces = WordpieceBatch.from_batch_encoding(token_data)
model.layers[0].initialize(X=wordpieces)
tensors = model.layers[0].predict(wordpieces)
[docs]def huggingface_classification_from_pretrained(
source: Union[Path, str], config: Dict, num_labels: int
):
"""
Create a Huggingface transformer model from pretrained weights. Will
download the model if it is not already downloaded.
Args:
source (Union[str, Path]): The name of the model or a path to it, such as
'bert-base-cased'.
config (dict): Settings to pass to the tokenizer.
"""
if hasattr(source, "absolute"):
str_path = str(source.absolute())
else:
str_path = source
tokenizer = AutoTokenizer.from_pretrained(str_path, **config)
transformer = AutoModelForSequenceClassification.from_pretrained(
str_path, num_labels=num_labels
)
ops = get_current_ops()
if isinstance(ops, CupyOps):
transformer.cuda()
return tokenizer, transformer
[docs]def make_classification_getter(category, labels, doc_extension):
def prop_getter(doc) -> dict:
trf_data = getattr(doc._, doc_extension)
if trf_data.tensors:
return {
"prop": softmax(trf_data.tensors[0][0]).round(decimals=3),
"labels": labels,
}
else:
warnings.warn(
"The tensors from the transformer forward pass is empty this is likely caused by an empty input string. Thus the model will return None"
)
return {
"prop": None,
"labels": labels,
}
def label_getter(doc) -> Optional[str]:
prop = getattr(doc._, f"{category}_prop")
if prop["prop"] is not None:
return labels[int(prop["prop"].argmax())]
else:
return None
return prop_getter, label_getter
[docs]def install_extensions(doc_extension_attribute) -> None:
if not Doc.has_extension(doc_extension_attribute):
Doc.set_extension(doc_extension_attribute, default=None)
[docs]def install_classification_extensions(
category: str,
labels: list,
doc_extension: str,
force: bool,
):
prop_getter, label_getter = make_classification_getter(
category, labels, doc_extension
)
Doc.set_extension(f"{category}_prop", getter=prop_getter, force=force)
Doc.set_extension(category, getter=label_getter, force=force)