SentenceTransformers

formed.integrations.sentence_transformers.analyzers

SentenceTransformerAnalyzer dataclass

SentenceTransformerAnalyzer(
    model_name_or_path, unicode_normalization=None
)

model_name_or_path instance-attribute

model_name_or_path

unicode_normalization class-attribute instance-attribute

unicode_normalization = None
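
A minimal construction sketch. The dataclass takes a model name or path plus an optional unicode_normalization (None by default); how the analyzer is invoked by a formed Tokenizer is not shown on this page, so only construction is illustrated, and the normalization form name is an assumption.

from formed.integrations.sentence_transformers.analyzers import (
    SentenceTransformerAnalyzer,
)

# unicode_normalization defaults to None; "NFKC" is an assumed value,
# following the unicodedata.normalize form names.
analyzer = SentenceTransformerAnalyzer(
    "sentence-transformers/all-MiniLM-L6-v2",
    unicode_normalization="NFKC",
)
print(analyzer.model_name_or_path)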

formed.integrations.sentence_transformers.utils

load_sentence_transformer cached

load_sentence_transformer(model_name_or_path, **kwargs)
Source code in src/formed/integrations/sentence_transformers/utils.py
@lru_cache(maxsize=8)
def load_sentence_transformer(
    model_name_or_path: str | PathLike,
    **kwargs: Any,
) -> SentenceTransformer:
    with suppress(FileNotFoundError):
        model_name_or_path = minato.cached_path(model_name_or_path)
    return SentenceTransformer(str(model_name_or_path), **kwargs)
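
Usage sketch. Because the loader is wrapped in lru_cache(maxsize=8), repeated calls with identical (hashable) arguments return the same model instance, and keyword arguments are forwarded to the SentenceTransformer constructor. The model name below is illustrative.

from formed.integrations.sentence_transformers.utils import (
    load_sentence_transformer,
)

model = load_sentence_transformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    device="cpu",  # forwarded to SentenceTransformer(**kwargs)
)
same = load_sentence_transformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    device="cpu",
)
assert model is same  # served from the lru_cache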

formed.integrations.sentence_transformers.workflow

Workflow steps for Sentence Transformers integration.

This module provides workflow steps for loading, training, and converting sentence transformer models.

Available Steps
  • sentence_transformers::load: Load a pre-trained sentence transformer model.
  • sentence_transformers::train: Train a sentence transformer model.
  • sentence_transformers::convert_tokenizer: Convert a sentence transformer tokenizer to a formed Tokenizer (requires ml integration).

SentenceTransformerFormat

Bases: Generic[SentenceTransformerT], Format[SentenceTransformerT]

identifier property

identifier

Get the unique identifier for this format.

RETURNS DESCRIPTION
str

Format identifier string.

write

write(artifact, directory)
Source code in src/formed/integrations/sentence_transformers/workflow.py
def write(self, artifact: SentenceTransformerT, directory: Path) -> None:
    artifact.save_pretrained(str(directory / "model"))

read

read(directory)
Source code in src/formed/integrations/sentence_transformers/workflow.py
def read(self, directory: Path) -> SentenceTransformerT:
    return cast(SentenceTransformerT, SentenceTransformer(str(directory / "model")))
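
A round-trip sketch, assuming SentenceTransformerFormat can be instantiated directly (as the @step decorator below does). write saves the artifact under directory/"model" via save_pretrained, and read reconstructs it with the SentenceTransformer constructor; the directory path is illustrative.

from pathlib import Path

from formed.integrations.sentence_transformers.workflow import (
    SentenceTransformerFormat,
)
from sentence_transformers import SentenceTransformer

fmt = SentenceTransformerFormat()
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

directory = Path("/tmp/st-artifact")  # illustrative location
directory.mkdir(parents=True, exist_ok=True)
fmt.write(model, directory)    # saves to /tmp/st-artifact/model
restored = fmt.read(directory)  # loads the saved model back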

is_default_of classmethod

is_default_of(obj)

Check if this format is the default for the given object type.

PARAMETER DESCRIPTION
obj

Object to check.

TYPE: Any

RETURNS DESCRIPTION
bool

True if this format should be used by default for this type.

Source code in src/formed/workflow/format.py
@classmethod
def is_default_of(cls, obj: Any) -> bool:
    """Check if this format is the default for the given object type.

    Args:
        obj: Object to check.

    Returns:
        True if this format should be used by default for this type.

    """
    return False

load_pretrained_model

load_pretrained_model(model_name_or_path, **kwargs)

Load a pre-trained sentence transformer model.

PARAMETER DESCRIPTION
model_name_or_path

Model identifier or path to model directory.

TYPE: str | PathLike

**kwargs

Additional arguments to pass to SentenceTransformer constructor.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION
SentenceTransformer

Loaded SentenceTransformer model.

Source code in src/formed/integrations/sentence_transformers/workflow.py
@step("sentence_transformers::load", cacheable=False)
def load_pretrained_model(
    model_name_or_path: str | PathLike,
    **kwargs: Any,
) -> SentenceTransformer:
    """Load a pre-trained sentence transformer model.

    Args:
        model_name_or_path: Model identifier or path to model directory.
        **kwargs: Additional arguments to pass to SentenceTransformer constructor.

    Returns:
        Loaded SentenceTransformer model.
    """
    with suppress(Exception):
        model_name_or_path = minato.cached_path(model_name_or_path)
    return SentenceTransformer(str(model_name_or_path), **kwargs)
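
Direct-call sketch. Within a workflow this step is addressed as "sentence_transformers::load" (cacheable=False, since the output is a live model object); the sketch assumes the @step-decorated function remains callable as a plain function, with extra keyword arguments flowing into the SentenceTransformer constructor.

from formed.integrations.sentence_transformers.workflow import (
    load_pretrained_model,
)

model = load_pretrained_model(
    "sentence-transformers/all-MiniLM-L6-v2",
    device="cpu",  # any SentenceTransformer constructor kwarg
)
embeddings = model.encode(["a sample sentence"])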

train_sentence_transformer

train_sentence_transformer(
    model,
    loss,
    args,
    dataset=None,
    loss_modifier=None,
    data_collator=None,
    tokenizer=None,
    evaluator=None,
    callbacks=None,
    model_init=None,
    compute_metrics=None,
    optimizers=(None, None),
    preprocess_logits_for_metrics=None,
    train_dataset_key="train",
    eval_dataset_key="validation",
)

Train a sentence transformer model.

This step trains a SentenceTransformer model using the provided loss function, datasets, and training arguments.

PARAMETER DESCRIPTION
model

SentenceTransformer model to train.

TYPE: SentenceTransformer

loss

Loss function(s) for training (single or mapping by dataset key).

TYPE: Mapping[str, Lazy[Module]] | Lazy[Module]

args

Training arguments configuration.

TYPE: Lazy[SentenceTransformerTrainingArguments]

dataset

Training/validation datasets.

TYPE: None | (Dataset | DatasetDict | Mapping[str, Dataset | DatasetDict]) DEFAULT: None

loss_modifier

Optional modifier(s) to apply to the loss function.

TYPE: None | (Mapping[str, list[Lazy[Module]] | Lazy[Module]] | list[Lazy[Module]] | Lazy[Module]) DEFAULT: None

data_collator

Optional data collator for batching.

TYPE: DataCollator | None DEFAULT: None

tokenizer

Optional tokenizer.

TYPE: PreTrainedTokenizerBase | None DEFAULT: None

evaluator

Optional evaluator(s) for validation.

TYPE: SentenceEvaluator | list[SentenceEvaluator] | None DEFAULT: None

callbacks

Optional training callbacks.

TYPE: list[TrainerCallback] | None DEFAULT: None

model_init

Optional model initialization function.

TYPE: Callable[[], SentenceTransformer] | None DEFAULT: None

compute_metrics

Optional metrics computation function.

TYPE: Callable[[EvalPrediction], dict] | None DEFAULT: None

optimizers

Optional optimizer and learning rate scheduler.

TYPE: tuple[Lazy[Optimizer] | None, Lazy[LambdaLR] | None] DEFAULT: (None, None)

preprocess_logits_for_metrics

Optional logits preprocessing function.

TYPE: Callable[[Tensor, Tensor], Tensor] | None DEFAULT: None

train_dataset_key

Key for training dataset split.

TYPE: str DEFAULT: 'train'

eval_dataset_key

Key for evaluation dataset split.

TYPE: str DEFAULT: 'validation'

RETURNS DESCRIPTION
SentenceTransformer

Trained SentenceTransformer model.

Source code in src/formed/integrations/sentence_transformers/workflow.py
@step("sentence_transformers::train", format=SentenceTransformerFormat())
def train_sentence_transformer(
    model: SentenceTransformer,
    loss: Mapping[str, Lazy[torch.nn.Module]] | Lazy[torch.nn.Module],
    args: Lazy[SentenceTransformerTrainingArguments],
    dataset: None
    | (
        datasets.Dataset
        | datasets.DatasetDict
        | Mapping[
            str,
            datasets.Dataset | datasets.DatasetDict,
        ]
    ) = None,
    loss_modifier: None
    | (
        Mapping[str, list[Lazy[torch.nn.Module]] | Lazy[torch.nn.Module]]
        | list[Lazy[torch.nn.Module]]
        | Lazy[torch.nn.Module]
    ) = None,
    data_collator: DataCollator | None = None,  # pyright: ignore[reportInvalidTypeForm]
    tokenizer: PreTrainedTokenizerBase | None = None,
    evaluator: SentenceEvaluator | list[SentenceEvaluator] | None = None,
    callbacks: list[TrainerCallback] | None = None,
    model_init: Callable[[], SentenceTransformer] | None = None,
    compute_metrics: Callable[[EvalPrediction], dict] | None = None,
    optimizers: tuple[
        Lazy[torch.optim.Optimizer] | None,
        Lazy[torch.optim.lr_scheduler.LambdaLR] | None,
    ] = (None, None),
    preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
    train_dataset_key: str = "train",
    eval_dataset_key: str = "validation",
) -> SentenceTransformer:
    """Train a sentence transformer model.

    This step trains a SentenceTransformer model using the provided loss function,
    datasets, and training arguments.

    Args:
        model: SentenceTransformer model to train.
        loss: Loss function(s) for training (single or mapping by dataset key).
        args: Training arguments configuration.
        dataset: Training/validation datasets.
        loss_modifier: Optional modifier(s) to apply to the loss function.
        data_collator: Optional data collator for batching.
        tokenizer: Optional tokenizer.
        evaluator: Optional evaluator(s) for validation.
        callbacks: Optional training callbacks.
        model_init: Optional model initialization function.
        compute_metrics: Optional metrics computation function.
        optimizers: Optional optimizer and learning rate scheduler.
        preprocess_logits_for_metrics: Optional logits preprocessing function.
        train_dataset_key: Key for training dataset split.
        eval_dataset_key: Key for evaluation dataset split.

    Returns:
        Trained SentenceTransformer model.
    """
    workdir = use_step_workdir()

    args_ = args.construct(output_dir=str(workdir))

    if isinstance(dataset, datasets.Dataset):
        train_dataset = dataset
        eval_dataset = None
    else:
        train_dataset = dataset.get(train_dataset_key) if dataset and args_.do_train else None
        eval_dataset = dataset.get(eval_dataset_key) if dataset and args_.do_eval else None

    loss_: torch.nn.Module | dict[str, torch.nn.Module]
    if isinstance(loss, Mapping):
        loss_ = {k: ll.construct(model=model) for k, ll in loss.items()}
    else:
        loss_ = loss.construct(model=model)
    if loss_modifier:
        if isinstance(loss_modifier, Mapping):
            assert isinstance(loss_, dict)
            for k, m in loss_modifier.items():
                if not isinstance(m, list):
                    m = [m]
                for n in m:
                    loss_[k] = n.construct(model=model, loss=loss_[k])
        else:
            if not isinstance(loss_modifier, list):
                loss_modifier = [loss_modifier]
            if isinstance(loss_, dict):
                for k, ll in loss_.items():
                    for m in loss_modifier:
                        loss_[k] = m.construct(model=model, loss=ll)
            else:
                for m in loss_modifier:
                    loss_ = m.construct(model=model, loss=loss_)

    lazy_optimizer, lazy_lr_scheduler = optimizers
    optimizer = lazy_optimizer.construct(params=model.parameters()) if lazy_optimizer else None
    lr_scheduler = lazy_lr_scheduler.construct(optimizer=optimizer) if lazy_lr_scheduler else None

    trainer = SentenceTransformerTrainer(
        model=model,
        loss=loss_,
        args=args_,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        evaluator=evaluator,
        callbacks=callbacks,
        model_init=model_init,
        compute_metrics=compute_metrics,
        optimizers=(optimizer, lr_scheduler),  # type: ignore[arg-type]
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )
    trainer.train()
    return model
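
A shape-only sketch of the step's arguments. Two caveats: Lazy is formed's deferred-construction wrapper, and the step only requires that loss.construct(model=...) and args.construct(output_dir=...) work, so Lazy(cls, **kwargs) is an assumed spelling (as is the import path); and the body calls use_step_workdir(), so the step expects to run inside a formed workflow execution context rather than as a bare function call.

import datasets
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.training_args import (
    SentenceTransformerTrainingArguments,
)

from formed.workflow import Lazy  # import path assumed
from formed.integrations.sentence_transformers.workflow import (
    train_sentence_transformer,
)

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# With a DatasetDict, the "train" split is used only when do_train is set
# (see the dataset handling above); "validation" feeds do_eval.
dataset = datasets.DatasetDict(
    {
        "train": datasets.Dataset.from_dict(
            {"anchor": ["a query"], "positive": ["a matching passage"]}
        ),
    }
)

# Inside a workflow, addressed as "sentence_transformers::train".
trained = train_sentence_transformer(
    model=model,
    loss=Lazy(losses.MultipleNegativesRankingLoss),  # constructed with model=...
    args=Lazy(
        SentenceTransformerTrainingArguments,
        do_train=True,
        num_train_epochs=1,
    ),
    dataset=dataset,
)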

convert_tokenizer

convert_tokenizer(
    model_name_or_path,
    pad_token=NotSpecified.VALUE,
    unk_token=NotSpecified.VALUE,
    bos_token=NotSpecified.VALUE,
    eos_token=NotSpecified.VALUE,
    freeze=True,
    accessor=None,
    characters=None,
    text_vector=None,
    token_vectors=None,
)

Convert a sentence transformer model's tokenizer to a formed Tokenizer.

This step extracts the tokenizer from a sentence transformer model and converts it into a formed Tokenizer with specified special tokens.

PARAMETER DESCRIPTION
model_name_or_path

Model identifier or path to model directory.

TYPE: str | PathLike

pad_token

Padding token (uses model default if not specified).

TYPE: str | None | NotSpecified DEFAULT: NotSpecified.VALUE

unk_token

Unknown token (uses model default if not specified).

TYPE: str | None | NotSpecified DEFAULT: NotSpecified.VALUE

bos_token

Beginning-of-sequence token (uses model default if not specified).

TYPE: str | None | NotSpecified DEFAULT: NotSpecified.VALUE

eos_token

End-of-sequence token (uses model default if not specified).

TYPE: str | None | NotSpecified DEFAULT: NotSpecified.VALUE

freeze

Whether to freeze the vocabulary.

TYPE: bool DEFAULT: True

accessor

Optional accessor for token extraction.

TYPE: str | Callable | None DEFAULT: None

RETURNS DESCRIPTION
Tokenizer

Converted formed Tokenizer.

RAISES DESCRIPTION
AssertionError

If pad_token is not specified and not available in the model.

Source code in src/formed/integrations/sentence_transformers/workflow.py
@step("sentence_transformers::convert_tokenizer", format="json")
def convert_tokenizer(
    model_name_or_path: str | PathLike,
    pad_token: str | None | NotSpecified = NotSpecified.VALUE,
    unk_token: str | None | NotSpecified = NotSpecified.VALUE,
    bos_token: str | None | NotSpecified = NotSpecified.VALUE,
    eos_token: str | None | NotSpecified = NotSpecified.VALUE,
    freeze: bool = True,
    accessor: str | Callable | None = None,
    characters: TokenCharactersIndexer | None = None,
    text_vector: TensorTransform | None = None,
    token_vectors: TensorSequenceTransform | None = None,
) -> Tokenizer:
    """Convert a sentence transformer model's tokenizer to a formed Tokenizer.

    This step extracts the tokenizer from a sentence transformer model and
    converts it into a formed Tokenizer with specified special tokens.

    Args:
        model_name_or_path: Model identifier or path to model directory.
        pad_token: Padding token (uses model default if not specified).
        unk_token: Unknown token (uses model default if not specified).
        bos_token: Beginning-of-sequence token (uses model default if not specified).
        eos_token: End-of-sequence token (uses model default if not specified).
        freeze: Whether to freeze the vocabulary.
        accessor: Optional accessor for token extraction.

    Returns:
        Converted formed Tokenizer.

    Raises:
        AssertionError: If pad_token is not specified and not available in the model.
    """
    model = load_sentence_transformer(model_name_or_path)

    def get_token(given: str | None | NotSpecified, default: Any) -> str | None:
        if not isinstance(given, NotSpecified):
            return given
        if isinstance(default, str):
            return default
        return None

    vocab = model.tokenizer.get_vocab().copy()
    pad_token = get_token(pad_token, getattr(model.tokenizer, "pad_token", None))
    unk_token = get_token(unk_token, getattr(model.tokenizer, "unk_token", None))
    bos_token = get_token(bos_token, getattr(model.tokenizer, "bos_token", None))
    eos_token = get_token(eos_token, getattr(model.tokenizer, "eos_token", None))

    assert isinstance(pad_token, str), "pad_token must be specified or available in the tokenizer"

    surface_indexer = TokenSequenceIndexer(
        vocab=vocab,
        pad_token=pad_token,
        unk_token=unk_token,
        bos_token=bos_token,
        eos_token=eos_token,
        freeze=freeze,
    )
    analyzer = SentenceTransformerAnalyzer(model_name_or_path)
    return Tokenizer(
        surfaces=surface_indexer,
        characters=characters,
        text_vector=text_vector,
        token_vectors=token_vectors,
        analyzer=Param.cast(analyzer),
        accessor=accessor,
    )
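
Direct-call sketch, with the same caveat about @step registration. By default the special tokens resolve to the underlying tokenizer's own values via the NotSpecified sentinel; pad_token must end up a string or the assertion fires.

from formed.integrations.sentence_transformers.workflow import convert_tokenizer

# Inherit pad/unk/bos/eos from the model's tokenizer; vocabulary frozen.
tokenizer = convert_tokenizer("sentence-transformers/all-MiniLM-L6-v2")

# Or override a special token explicitly:
tokenizer = convert_tokenizer(
    "sentence-transformers/all-MiniLM-L6-v2",
    pad_token="[PAD]",
    freeze=True,
)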