# Absolute path to the project root, resolved _relative to the directory this
# script is launched from_ (two levels up from the current working directory).
import os

ABSOLUTE_PATH = os.path.abspath("../../")
Частотный анализ на базе TF-IDF
Частотный анализ на базе TF-IDF
- TF (term frequency) - отвечает за частоту встречаемости слова в текущем сообщении
- IDF (inverse document frequency) - отвечает за “редкость” слова, насколько слово редко встречается в других сообщениях
Перед запуском ноутбука запустите MLflow Server в консоли:
mlflow server --host 127.0.0.1 --port 8080
# Parameters specific to this notebook (not needed in the shared config).
TASK_NAME = "TF/IDF Embeddings"
EXPERIMENT_NAME = "Embedding_50K_v2"
# Used as part of a path: no spaces or special characters!
RUN_NAME = "tfidf_embeddings"
Загружаем параметры из файла конфигурации
from hydra import compose, initialize
from hydra.core.global_hydra import GlobalHydra
from omegaconf import OmegaConf
# Reset the global Hydra state first so this cell can be re-run in the same
# kernel without `initialize` being called on an already-initialized instance.
GlobalHydra.instance().clear()

# Hydra context initialization: look for the config next to this notebook.
initialize(version_base=None, config_path=".", job_name=TASK_NAME)

# Compose the configuration object from `config.yaml`.
cfg = compose(config_name="config")

# Debug helper: uncomment to dump the resolved configuration.
# print(OmegaConf.to_yaml(cfg))
Загружаем датасет
import os
import polars as pl
# Load the training reviews. The CSV has no header row, so column names are
# supplied explicitly; the number of rows read is capped by the config.
data = pl.read_csv(
    os.path.join(ABSOLUTE_PATH, cfg.paths.data, cfg.files.train_data),
    has_header=False,
    new_columns=["Polarity", "Title", "Review"],
    n_rows=cfg.params.nrows,
)

# Show up to 100 characters of each string cell when printing frames.
pl.Config.set_fmt_str_lengths(100)
data.head(3)
shape: (3, 3)
Polarity | Title | Review |
---|---|---|
i64 | str | str |
2 | "Stuning even for the non-gamer" | "This sound track was beautiful! It paints the senery in your mind so well I would recomend it even t… |
2 | "The best soundtrack ever to anything." | "I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd w… |
2 | "Amazing!" | "This soundtrack is my favorite music of all time, hands down. The intense sadness of "Prisoners of F… |
# Keep only the label and the review text, and replace the numeric polarity
# with a readable class name: 1 -> "Negative", anything else -> "Positive".
# `return_dtype` is given explicitly so polars does not have to infer the
# output type (this is what MapWithoutReturnDtypeWarning complains about).
data = data.select("Polarity", "Review").with_columns(
    pl.col("Polarity").map_elements(
        lambda polarity: "Negative" if polarity == 1 else "Positive",
        return_dtype=pl.String,
    )
)
/tmp/ipykernel_1651/553745447.py:1: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.
data = data.select('Polarity', 'Review').with_columns(
Проверяем, что датасет сбалансирован по отзывам
# Count the reviews per class to check how balanced the dataset is.
data["Polarity"].value_counts()
shape: (2, 2)
Polarity | count |
---|---|
str | u32 |
"Positive" | 25506 |
"Negative" | 24494 |
Предобработка текста и лемматизатор
ВАЖНО: не использовать этот лемматизатор для русского языка, т.к. уничтожит падежи!
from nltk.corpus import stopwords
import nltk
import re
# Fetch the NLTK resources used below (a no-op when already up to date).
nltk.download("wordnet")
nltk.download("stopwords")

# Patterns are compiled once at module level instead of on every call
# (the original author's note reports a ≈60x speed-up from this).
stop_words = set(stopwords.words("english"))
# Matches URLs, [bracketed] fragments, runs of characters that are neither
# ASCII letters nor whitespace, and any word containing a digit.
url_pattern = re.compile(r"https?://\S+|www\.\S+|\[.*?\]|[^a-zA-Z\s]+|\w*\d\w*")
# Matches runs of digits, underscores and hyphens.
spec_chars_pattern = re.compile("[0-9_-]+")
# Matches runs of anything that is not an ASCII letter (whitespace included).
non_alpha_pattern = re.compile("[^a-zA-Z]+")
def preprocessing(input_text: str) -> str:
    """Normalize a raw review into a space-separated string of tokens.

    Steps, in order: lowercase the text, delete everything matched by
    ``url_pattern``, replace matches of ``spec_chars_pattern`` and
    ``non_alpha_pattern`` with a single space, then drop English stop words.

    Args:
        input_text: Raw review text.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing is left.
    """
    text = input_text.lower()
    # Matches are removed outright (not replaced by a space), so characters
    # on both sides of a match end up adjacent: "i'm" becomes "im".
    text = url_pattern.sub("", text)
    text = spec_chars_pattern.sub(" ", text)
    text = non_alpha_pattern.sub(" ", text)

    # split()/join() also collapses whitespace and trims both ends, so no
    # extra strip() is needed on the result.
    return " ".join(word for word in text.split() if word not in stop_words)
# Clean every review and split it on spaces into a token list stored in a new
# `corpus` column. `return_dtype` is given explicitly so polars does not have
# to infer the output type (silences MapWithoutReturnDtypeWarning).
data = data.with_columns(
    pl.col("Review")
    .map_elements(preprocessing, return_dtype=pl.String)
    .str.split(" ")
    .alias("corpus")
)

data.head(3)
[nltk_data] Downloading package wordnet to
[nltk_data] /teamspace/studios/this_studio/nltk_data...
[nltk_data] Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data] /teamspace/studios/this_studio/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
sys:1: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.
shape: (3, 3)
Polarity | Review | corpus |
---|---|---|
str | str | list[str] |
"Positive" | "This sound track was beautiful! It paints the senery in your mind so well I would recomend it even t… | ["sound", "track", … "listen"] |
"Positive" | "I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd w… | ["im", "reading", … "penny"] |
"Positive" | "This soundtrack is my favorite music of all time, hands down. The intense sadness of "Prisoners of F… | ["soundtrack", "favorite", … "stars"] |
from nltk.stem import WordNetLemmatizer
def lemmatize(input_frame: pl.DataFrame) -> pl.DataFrame:
    """Lemmatize every token in the ``corpus`` column.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with that method's
    default arguments.

    Args:
        input_frame: Frame with a ``corpus`` column holding lists of tokens.

    Returns:
        A new frame in which ``corpus`` holds the lemmatized token lists; all
        other columns are carried over unchanged.
    """
    lemmatizer = WordNetLemmatizer()

    return input_frame.with_columns(
        pl.col("corpus").map_elements(
            lambda input_list: [lemmatizer.lemmatize(token) for token in input_list],
            # Explicit dtype so polars does not have to infer the output type
            # (silences MapWithoutReturnDtypeWarning).
            return_dtype=pl.List(pl.String),
        )
    )
# Lemmatize the tokenized reviews and preview the result.
processed_data = lemmatize(data)
processed_data.head(3)
/tmp/ipykernel_1651/2055772167.py:6: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.
return input_frame.with_columns(
shape: (3, 3)
Polarity | Review | corpus |
---|---|---|
str | str | list[str] |
"Positive" | "This sound track was beautiful! It paints the senery in your mind so well I would recomend it even t… | ["sound", "track", … "listen"] |
"Positive" | "I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd w… | ["im", "reading", … "penny"] |
"Positive" | "This soundtrack is my favorite music of all time, hands down. The intense sadness of "Prisoners of F… | ["soundtrack", "favorite", … "star"] |
Разбиваем на тренировочную и тестовую выборки, получаем признаки для train и test
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Vectorizer settings are kept in a dict so the same values can be logged
# alongside the run later on.
vectorizer_params = {
    "max_features": cfg.params.max_features,
    "analyzer": "word",
}

tfidf_vectorizer = TfidfVectorizer(**vectorizer_params)

train, test = train_test_split(
    processed_data,
    test_size=cfg.params.test_size,
    shuffle=cfg.params.shuffle,
    random_state=cfg.params.random_state,
)

# Turn each token list back into one space-separated document. The vectorizer
# is fitted on exactly the same text representation it later transforms;
# previously it was fitted on `str()` of each token list, which ties the
# vocabulary to how the list container happens to print.
train_texts = train["corpus"].list.join(" ").to_numpy()
test_texts = test["corpus"].list.join(" ").to_numpy()

# Fit on the training split only, so no test-set statistics leak into IDF.
train_features = tfidf_vectorizer.fit_transform(train_texts)
test_features = tfidf_vectorizer.transform(test_texts)
Вспомогательный код для отрисовки Confusion матрицы
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.figure import Figure
from sklearn.metrics import (
ConfusionMatrixDisplay,
)
def conf_matrix(y_true: np.ndarray, pred: np.ndarray) -> Figure:
    """Draw a confusion matrix and return the figure it was drawn on.

    Args:
        y_true: Ground-truth labels.
        pred: Predicted labels.

    Returns:
        A 5x5-inch matplotlib figure containing the confusion matrix, without
        a colorbar and with the x tick labels rotated by 90 degrees.
    """
    # Switch interactive mode off before creating the figure.
    plt.ioff()

    fig, ax = plt.subplots(figsize=(5, 5))
    ConfusionMatrixDisplay.from_predictions(y_true, pred, ax=ax, colorbar=False)
    ax.xaxis.set_tick_params(rotation=90)
    ax.set_title("Confusion Matrix")

    # Lay out this specific figure rather than whichever one is "current".
    fig.tight_layout()
    return fig
Используем MLflow для логирования результатов обучения логистической регрессии
import mlflow
from sklearn.metrics import classification_report
# Connect to the MLflow tracking server (it must already be running).
mlflow.set_tracking_uri("http://127.0.0.1:8080")

# Select the experiment that this run is recorded under.
mlflow.set_experiment(EXPERIMENT_NAME)

with mlflow.start_run(run_name=RUN_NAME) as run:
    model_params = {
        "random_state": cfg.params.random_state,
        "multi_class": "multinomial",
        "solver": "saga",
    }

    model_lr = LogisticRegression(**model_params)
    model_lr.fit(train_features, train["Polarity"])

    predicts = model_lr.predict(test_features)

    report = classification_report(test["Polarity"], predicts, output_dict=True)

    # Collect the metrics from the classification report: overall accuracy
    # plus the per-class values.
    metrics = {"accuracy": report.pop("accuracy")}
    for class_or_avg, metrics_dict in report.items():
        # Stop at the first aggregate row; only the entries that precede
        # "macro avg" in the report are logged.
        if class_or_avg == 'macro avg':
            break
        for metric, value in metrics_dict.items():
            metrics[class_or_avg + '_' + metric] = value

    # Send all metrics in one call instead of one request per metric.
    mlflow.log_metrics(metrics)

    # Log the vectorizer and model parameters.
    mlflow.log_params(vectorizer_params)
    mlflow.log_params(model_params)

    # Log the whole fitted model so that it can be reused later.
    mlflow.sklearn.log_model(
        sk_model=model_lr,
        input_example=test_features[:10],
        artifact_path=f"mlflow/{RUN_NAME}/model",
    )

    # Build the confusion matrix for the two classes.
    fig = conf_matrix(test["Polarity"], predicts)

    # Log the figure as a run artifact, then release it so that repeated runs
    # in the same kernel do not accumulate open figures.
    mlflow.log_figure(fig, f'{RUN_NAME}_confusion_matrix.png')
    plt.close(fig)
2024/05/19 18:22:30 INFO mlflow.tracking.fluent: Experiment with name 'Embedding_50K_v2' does not exist. Creating a new experiment.