# Task-specific parameters that are not worth moving into the config file.
TASK_NAME = "Bert Classifier"
# BERT_MODEL_FILENAME = "clm_bert_model.pkl"
Эмбеддинг на базе Bert Transformer
Эмбеддинг на базе Bert Transformer
BERT (Bidirectional Encoder Representations from Transformers) - одна из наиболее известных и успешных моделей обработки естественного языка, основанная на архитектуре Трансформер.
# Load the environment variables that hold the ClearML secrets
from dotenv import load_dotenv
load_dotenv()
True
Загружаем параметры из файла конфигурации
from hydra import compose, initialize
from hydra.core.global_hydra import GlobalHydra
from omegaconf import OmegaConf
# Drop any Hydra state left over from a previous run of this cell; the
# initialize() call below is made on a clean global instance.
GlobalHydra.instance().clear()

# Hydra context initialization.
# NOTE(review): job_name still says "tf-idf-classifier" although this notebook
# trains a BERT-based classifier -- confirm whether that is intentional.
initialize(version_base=None, config_path=".", job_name="tf-idf-classifier")

# Compose the configuration object from the "config" file next to the notebook.
cfg = compose(config_name="config")
# print(OmegaConf.to_yaml(cfg))
Инициируем трекинг в ClearML
from clearml import Dataset, Task
# Start experiment tracking in ClearML.
task = Task.init(
    project_name=cfg.project.name,
    task_name=TASK_NAME,
    output_uri=True,
)

# Fetch a local copy of the dataset registered in ClearML.
dataset_path = Dataset.get(
    dataset_project=cfg.dataset.project,
    dataset_name=cfg.dataset.name,
).get_local_copy()

# Record execution progress in ClearML.
task.set_progress(0)
ClearML Task: created new task id=45554c2750e54c0bb5fd93f6ed64ffa7
2024-05-19 13:40:54,986 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/632d87a87e714a7fbc73c21e83eebfa5/experiments/45554c2750e54c0bb5fd93f6ed64ffa7/output/log
2024-05-19 13:40:55,594 - clearml - INFO - Dataset.get() did not specify alias. Dataset information will not be automatically logged in ClearML Server.
Загружаем датасет
import os
import polars as pl
# Read the reviews CSV from the local dataset copy. The file has no header
# row, so the column names are supplied explicitly; the number of rows read
# is limited by the config.
data = pl.read_csv(
    os.path.join(dataset_path, cfg.dataset.file),
    has_header=False,
    new_columns=["Polarity", "Title", "Review"],
    n_rows=cfg.params.nrows,
)

# Widen the string display limit so that review texts are readable in the preview.
pl.Config.set_fmt_str_lengths(100)
data.head(3)
shape: (3, 3)
Polarity | Title | Review |
---|---|---|
i64 | str | str |
2 | "Stuning even for the non-gamer" | "This sound track was beautiful! It paints the senery in your mind so well I would recomend it even t… |
2 | "The best soundtrack ever to anything." | "I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd w… |
2 | "Amazing!" | "This soundtrack is my favorite music of all time, hands down. The intense sadness of "Prisoners of F… |
# Record execution progress in ClearML.
task.set_progress(10)
Проверяем, что датасет сбалансирован по отзывам
# Keep only the label and the review text, and replace the numeric polarity
# (1 -> "Negative", anything else -> "Positive") with a readable string label.
# return_dtype is given explicitly so polars does not have to infer the output
# type (this silences MapWithoutReturnDtypeWarning).
data = data.select("Polarity", "Review").with_columns(
    pl.col("Polarity").map_elements(
        lambda polarity: "Negative" if polarity == 1 else "Positive",
        return_dtype=pl.Utf8,
    )
)

# Class balance check: number of reviews per label.
data["Polarity"].value_counts()
/tmp/ipykernel_1833/3547166247.py:1: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.
data = data.select('Polarity', 'Review').with_columns(
shape: (2, 2)
Polarity | count |
---|---|
str | u32 |
"Negative" | 24494 |
"Positive" | 25506 |
Разбиваем на тренировочную и тестовую выборки
from sklearn.model_selection import train_test_split
# Split the data into train and test parts; the test share, shuffling and
# random seed all come from the config.
train, test = train_test_split(
    data,
    test_size=cfg.params.test_size,
    shuffle=cfg.params.shuffle,
    random_state=cfg.params.random_state,
)
Загружаем модель Bert и её токенайзер
import torch
from transformers import AutoModel, AutoTokenizer
model_name = "bert-base-uncased"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained tokenizer and encoder, and move the encoder to the device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name).to(device)
/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
warnings.warn(
# Check whether a GPU is available (displays the selected torch device)
device
device(type='cuda')
Загружаем батчами во избежание переполнения памяти
from torch.utils.data import DataLoader
fixed_batch_size = 128

# Feed the review texts in fixed-size batches. Shuffling is off, so the batch
# order follows the row order of the train/test frames.
train_dataloader = DataLoader(
    train["Review"].to_list(), batch_size=fixed_batch_size, shuffle=False
)
test_dataloader = DataLoader(
    test["Review"].to_list(), batch_size=fixed_batch_size, shuffle=False
)
# Record execution progress in ClearML.
task.set_progress(20)
[GPU] Инференс получаем как выход с последнего слоя Bert
def batch_inference(batch):
    """Return one embedding vector per text in *batch*.

    The texts are tokenized with the module-level ``tokenizer`` (padded and
    truncated), run through the module-level ``bert_model`` on ``device``
    without gradient tracking, and the last-layer hidden state at sequence
    position 0 (the ``[CLS]`` token for BERT tokenizers) is taken as the
    embedding of each text.

    Args:
        batch: A batch of texts as yielded by the DataLoader.

    Returns:
        A CPU tensor with one row per input text.
    """
    tokenized_batch = tokenizer(
        batch, padding=True, truncation=True, return_tensors="pt"
    ).to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_batch = bert_model(**tokenized_batch)
        # Move the result to the CPU so GPU memory is not held across batches.
        batch_embeddings = hidden_batch.last_hidden_state[:, 0, :].detach().to("cpu")
    return batch_embeddings
# Embed every batch and stack the per-batch results into one tensor per split.
train_embeddings = torch.concat(
    [batch_inference(batch_data) for batch_data in train_dataloader]
)
test_embeddings = torch.concat(
    [batch_inference(batch_data) for batch_data in test_dataloader]
)
ClearML results page: https://app.clear.ml/projects/632d87a87e714a7fbc73c21e83eebfa5/experiments/45554c2750e54c0bb5fd93f6ed64ffa7/output/log
ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start
# Record execution progress in ClearML.
task.set_progress(85)

# Store the computed embeddings as ClearML artifacts.
task.upload_artifact(
    name="train_embeddings",
    artifact_object=train_embeddings,
)
task.upload_artifact(
    name="test_embeddings",
    artifact_object=test_embeddings,
)
True
Обучаем на этих эмбеддингах логистическую регрессию:
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
model_params = {
    "random_state": cfg.params.random_state,
    "multi_class": "multinomial",
    "solver": "saga",
}
# Register the hyperparameters with the ClearML task.
task.connect(model_params)

# Fit logistic regression on the BERT embeddings.
model_lr = LogisticRegression(**model_params)
model_lr.fit(train_embeddings, train["Polarity"])
LogisticRegression(multi_class='multinomial', random_state=42, solver='saga')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(multi_class='multinomial', random_state=42, solver='saga')
import pickle
# Record execution progress in ClearML.
# The previous checkpoint reports 85 and the next one 95, so 90 is used here
# to keep the reported progress monotonic (it was 80, i.e. a step backwards).
task.set_progress(90)

# Store the trained model (pickled) as a ClearML artifact.
task.upload_artifact(
    name='LogisticRegression',
    artifact_object=pickle.dumps(model_lr),
)
True
Получаем предсказание модели
# Predict labels for the test embeddings.
predicts = model_lr.predict(test_embeddings)

# Per-class metrics as a nested dict, so they can be reported value by value.
report = classification_report(test["Polarity"], predicts, output_dict=True)

# Build the confusion matrix for the two classes.
conf_matrix = confusion_matrix(test["Polarity"], predicts)
# Record execution progress in ClearML.
task.set_progress(95)

# Report the model metrics.
logger = task.get_logger()

# "accuracy" is a plain number in the report dict, unlike the other entries
# (which are dicts of metrics), so it is popped and reported separately.
logger.report_single_value("Accuracy", report.pop("accuracy"))

for class_name, metrics in report.items():
    for metric, value in metrics.items():
        logger.report_single_value(f"{class_name}_{metric}", value)

logger.report_confusion_matrix("Confusion matrix", "ignored", matrix=conf_matrix)

# Closing the experiment is mandatory!
task.close()
Контрольная визуализация матрицы
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
# Local sanity-check plot of the confusion matrix.
fig, ax = plt.subplots(figsize=(5, 5))
ConfusionMatrixDisplay.from_predictions(
    test["Polarity"], predicts, ax=ax, colorbar=False
)
ax.xaxis.set_tick_params(rotation=90)
# Assign to "_" so the notebook does not echo the returned Text object.
_ = ax.set_title("Confusion Matrix")
plt.tight_layout()