# Parameters specific to this notebook (not needed in the shared config)
TASK_NAME = "TF/IDF Classifier"
Частотный анализ на базе TF-IDF
Частотный анализ на базе TF-IDF
- TF (term frequency) - отвечает за частоту встречаемости слова в текущем сообщении
- IDF (inverse document frequency) - отвечает за “редкость” слова, насколько слово редко встречается в других сообщениях
# Load the environment variables that hold the ClearML secrets
from dotenv import load_dotenv
load_dotenv()
True
Загружаем параметры из файла конфигурации
from hydra import compose, initialize
from hydra.core.global_hydra import GlobalHydra
from omegaconf import OmegaConf
# Reset the global Hydra instance before initializing
# (presumably so this cell can be re-run in the same kernel — confirm).
GlobalHydra.instance().clear()

# Hydra context initialization: configs are looked up next to the notebook.
initialize(version_base=None, config_path=".", job_name=TASK_NAME)
cfg = compose(config_name="config")

# print(OmegaConf.to_yaml(cfg))
Инициируем трекинг в ClearML
from clearml import Dataset, Task
# Start experiment tracking in ClearML
task = Task.init(
    project_name=cfg.project.name,
    task_name=TASK_NAME,
    output_uri=True,
)

# Fetch a local copy of the dataset.
# NOTE(review): the captured run log says Dataset.get() was called without an
# `alias`, so dataset information is not logged automatically — consider
# passing one.
dataset_path = Dataset.get(
    dataset_project=cfg.dataset.project,
    dataset_name=cfg.dataset.name,
).get_local_copy()

task.set_progress(0)
ClearML Task: created new task id=6b29acc710494218bd58f58b7e0f5f25
2024-05-19 13:35:15,204 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/632d87a87e714a7fbc73c21e83eebfa5/experiments/6b29acc710494218bd58f58b7e0f5f25/output/log
2024-05-19 13:35:15,844 - clearml - INFO - Dataset.get() did not specify alias. Dataset information will not be automatically logged in ClearML Server.
2024-05-19 13:35:16,486 - clearml.storage - INFO - Downloading: 82.82MB from https://files.clear.ml/Amazon%20reviews/.datasets/Raw%20data%20%28first%2050K%29/Raw%20data%20%28first%2050K%29.4ec23a435b794e8c8b2c564e7048708b/artifacts/data/dataset.4ec23a435b794e8c8b2c564e7048708b.b9jfc1mc.zip
2024-05-19 13:35:16,766 - clearml.storage - INFO - Downloaded 82.82 MB successfully from https://files.clear.ml/Amazon%20reviews/.datasets/Raw%20data%20%28first%2050K%29/Raw%20data%20%28first%2050K%29.4ec23a435b794e8c8b2c564e7048708b/artifacts/data/dataset.4ec23a435b794e8c8b2c564e7048708b.b9jfc1mc.zip , saved to /teamspace/studios/this_studio/.clearml/cache/storage_manager/datasets/08349fed5b7b1c5accf56d9dd9576e20.dataset.4ec23a435b794e8c8b2c564e7048708b.b9jfc1mc.zip
ClearML results page: https://app.clear.ml/projects/632d87a87e714a7fbc73c21e83eebfa5/experiments/6b29acc710494218bd58f58b7e0f5f25/output/log
ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring
2024-05-19 13:35:18,179 - clearml - INFO - Dataset.get() did not specify alias. Dataset information will not be automatically logged in ClearML Server.
████████████████████████▎ 78% | 65.0/82.82 MB [00:00<00:00, 300.89MB/s]: /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/tqdm/std.py:636: TqdmWarning: clamping frac to range [0, 1]
full_bar = Bar(frac,
███████████████████ 100% | 82.8218103790283/82.82 MB [00:00<00:00, 304.49MB/s]:
Загружаем датасет
import os
import polars as pl
# Read the CSV from the local dataset copy. The file is read without a header
# row, so the three column names are assigned explicitly; the number of rows
# is limited by the config.
data = pl.read_csv(
    os.path.join(dataset_path, cfg.dataset.file),
    has_header=False,
    new_columns=["Polarity", "Title", "Review"],
    n_rows=cfg.params.nrows,
)

# Widen the string display limit so the preview shows more of each review.
pl.Config.set_fmt_str_lengths(100)
data.head(3)
shape: (3, 3)
Polarity | Title | Review |
---|---|---|
i64 | str | str |
2 | "Stuning even for the non-gamer" | "This sound track was beautiful! It paints the senery in your mind so well I would recomend it even t… |
2 | "The best soundtrack ever to anything." | "I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd w… |
2 | "Amazing!" | "This soundtrack is my favorite music of all time, hands down. The intense sadness of "Prisoners of F… |
# Record execution progress in ClearML
task.set_progress(10)
Проверяем, что датасет сбалансирован по отзывам
# Keep only the label and the review text, and replace the numeric polarity
# (1 -> "Negative", anything else -> "Positive") with a readable label.
# `return_dtype` is given explicitly to avoid MapWithoutReturnDtypeWarning.
data = data.select('Polarity', 'Review').with_columns(
    pl.col("Polarity").map_elements(
        lambda polarity: "Negative" if polarity == 1 else "Positive",
        return_dtype=pl.Utf8,
    )
)

# Class counts, to check that the dataset is balanced.
data['Polarity'].value_counts()
/tmp/ipykernel_2833/3547166247.py:1: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.
data = data.select('Polarity', 'Review').with_columns(
shape: (2, 2)
Polarity | count |
---|---|
str | u32 |
"Positive" | 25506 |
"Negative" | 24494 |
Предобработка текста и лемматизатор
ВАЖНО: не использовать этот лемматизатор для русского языка, т.к. уничтожит падежи!
from nltk.corpus import stopwords
import nltk
import re
# Download the NLTK resources used below (WordNet and the stop-word lists).
nltk.download('wordnet')
nltk.download("stopwords")
# Pre-compile the regular expressions once (the author reports a ≈60x speed-up)
stop_words = set(stopwords.words("english"))
url_pattern = re.compile(r"https?://\S+|www\.\S+|\[.*?\]|[^a-zA-Z\s]+|\w*\d\w*")
spec_chars_pattern = re.compile("[0-9_-]+")
non_alpha_pattern = re.compile("[^a-zA-Z]+")
def preprocessing(input_text: str) -> str:
    """Normalize a raw review into a space-separated string of tokens.

    The text is lower-cased, matches of the module-level ``url_pattern``
    are deleted, matches of ``spec_chars_pattern`` and
    ``non_alpha_pattern`` are replaced by a space, and every word found in
    ``stop_words`` is dropped.

    Args:
        input_text: The raw review text.

    Returns:
        The cleaned text with single spaces between the remaining words
        (an empty string if nothing remains).
    """
    text = input_text.lower()  # lower-case everything
    text = url_pattern.sub("", text)  # remove links
    text = spec_chars_pattern.sub(" ", text)  # remove special characters
    text = non_alpha_pattern.sub(" ", text)  # keep letters only

    # split() also collapses repeated whitespace left by the substitutions.
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text.strip()
# Clean every review and split it into a token list stored in a new
# 'corpus' column. `return_dtype` is given explicitly to avoid
# MapWithoutReturnDtypeWarning.
data = data.with_columns(
    pl.col('Review')
    .map_elements(preprocessing, return_dtype=pl.Utf8)
    .str.split(' ')
    .alias('corpus')
)
data.head(3)
[nltk_data] Downloading package wordnet to
[nltk_data] /teamspace/studios/this_studio/nltk_data...
[nltk_data] Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data] /teamspace/studios/this_studio/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
sys:1: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.
shape: (3, 3)
Polarity | Review | corpus |
---|---|---|
str | str | list[str] |
"Positive" | "This sound track was beautiful! It paints the senery in your mind so well I would recomend it even t… | ["sound", "track", … "listen"] |
"Positive" | "I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd w… | ["im", "reading", … "penny"] |
"Positive" | "This soundtrack is my favorite music of all time, hands down. The intense sadness of "Prisoners of F… | ["soundtrack", "favorite", … "stars"] |
from nltk.stem import WordNetLemmatizer
def lemmatize(input_frame: pl.DataFrame) -> pl.DataFrame:
    """Lemmatize every token of the ``corpus`` column with WordNet.

    Args:
        input_frame: Frame with a ``corpus`` column holding lists of tokens.

    Returns:
        A frame in which each token of ``corpus`` has been passed through
        ``WordNetLemmatizer.lemmatize``.
    """
    lemmatizer = WordNetLemmatizer()

    return input_frame.with_columns(
        pl.col("corpus").map_elements(
            lambda input_list: [lemmatizer.lemmatize(token) for token in input_list],
            # Declared explicitly to avoid MapWithoutReturnDtypeWarning.
            return_dtype=pl.List(pl.Utf8),
        )
    )
# Lemmatize the tokenized reviews and preview the result.
processed_data = lemmatize(data)
processed_data.head(3)
/tmp/ipykernel_2833/2055772167.py:6: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.
return input_frame.with_columns(
shape: (3, 3)
Polarity | Review | corpus |
---|---|---|
str | str | list[str] |
"Positive" | "This sound track was beautiful! It paints the senery in your mind so well I would recomend it even t… | ["sound", "track", … "listen"] |
"Positive" | "I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd w… | ["im", "reading", … "penny"] |
"Positive" | "This soundtrack is my favorite music of all time, hands down. The intense sadness of "Prisoners of F… | ["soundtrack", "favorite", … "star"] |
# Record execution progress in ClearML
task.set_progress(20)

# Store the artifact in ClearML
task.upload_artifact(
    name="Processed data",
    artifact_object=processed_data,
)
True
Разбиваем на тренировочную и тестовую выборки, получаем признаки для train и test
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# Vectorizer settings: the vocabulary size comes from the config.
vectorizer_params = {
    "max_features": cfg.params.max_features,
    "analyzer": "word",
}
tfidf_vectorizer = TfidfVectorizer(**vectorizer_params)

train, test = train_test_split(
    processed_data,
    test_size=cfg.params.test_size,
    shuffle=cfg.params.shuffle,
    random_state=cfg.params.random_state,
)

# Join every token list back into one space-separated document. The
# vectorizer is fitted on exactly the representation it later transforms;
# it used to be fitted on the str() of each token list, which differs from
# the joined text passed to transform().
train_docs = train["corpus"].list.join(" ").to_numpy()
test_docs = test["corpus"].list.join(" ").to_numpy()

train_features = tfidf_vectorizer.fit_transform(train_docs)
test_features = tfidf_vectorizer.transform(test_docs)
import pickle
# Record execution progress in ClearML
task.set_progress(50)

# Store the artifacts in ClearML
task.upload_artifact(
    name='TfidfVectorizer',
    artifact_object=pickle.dumps(tfidf_vectorizer),  # pickled vectorizer bytes
)
task.upload_artifact(
    name="train_features",
    artifact_object=(train_features, train["Polarity"].to_numpy()),
)
task.upload_artifact(
    name="test_features",
    artifact_object=(test_features, test["Polarity"].to_numpy()),
)
True
Обучаем модель логистической регрессии
from sklearn.linear_model import LogisticRegression
# Logistic-regression settings; the random seed comes from the config.
model_params = {
    "random_state": cfg.params.random_state,
    "multi_class": "multinomial",
    "solver": "saga",
}
model_lr = LogisticRegression(**model_params)

model_lr.fit(train_features, train["Polarity"])  # train["Polarity"].to_numpy()
LogisticRegression(multi_class='multinomial', random_state=42, solver='saga')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(multi_class='multinomial', random_state=42, solver='saga')
# Record execution progress in ClearML
task.set_progress(80)

# Store the artifact in ClearML
task.upload_artifact(
    name='LogisticRegression',
    artifact_object=pickle.dumps(model_lr),
)
True
Получаем предсказание модели
from sklearn.metrics import classification_report, confusion_matrix
# Predict labels for the held-out features.
predicts = model_lr.predict(test_features)

# Per-class metrics as a nested dict (output_dict=True).
report = classification_report(test["Polarity"], predicts, output_dict=True)

# Build the confusion matrix for the two classes
conf_matrix = confusion_matrix(test["Polarity"], predicts)
Финализируем результаты в ClearML
# Record execution progress in ClearML
task.set_progress(95)

# Report the evaluation metrics
logger = task.get_logger()

# "accuracy" is removed from the report here, so only the remaining
# entries are iterated by the loop below.
logger.report_single_value("Accuracy", report.pop("accuracy"))

for class_name, metrics in report.items():
    for metric, value in metrics.items():
        logger.report_single_value(f"{class_name}_{metric}", value)

logger.report_confusion_matrix("Confusion matrix", "ignored", matrix=conf_matrix)

# Closing the experiment is mandatory!
task.close()
Контрольная визуализация матрицы
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
# Draw the confusion matrix on a 5x5 inch figure as a visual sanity check.
fig, ax = plt.subplots(figsize=(5, 5))
ConfusionMatrixDisplay.from_predictions(test["Polarity"], predicts, ax=ax, colorbar=False)
ax.xaxis.set_tick_params(rotation=90)
# Assign to `_` so the notebook does not echo the returned Text object.
_ = ax.set_title("Confusion Matrix")
plt.tight_layout()