import featureform as ff
from dotenv import load_dotenv
import os
# Load variables from a .env file into the process environment before
# they are read below.
load_dotenv()

# Postgres connection settings; os.getenv yields None for any unset variable.
POSTGRES_DATABASE = os.getenv('POSTGRES_DATABASE')
POSTGRES_USERNAME = os.getenv('POSTGRES_USERNAME')
POSTGRES_PASSWORD = os.getenv('POSTGRES_PASSWORD')

# Name of the Postgres table that is registered as the primary dataset.
DATASET_TABLE = "street_tree_census_tree_data"
# Variant label; currently only referenced from a commented-out argument.
VARIANT = "preprocessing"

# Featureform client used for every apply()/dataframe() call in this file.
# NOTE(review): insecure=True presumably disables TLS for the local
# endpoint - confirm before pointing this at a non-local host.
client = ff.Client(host="localhost:7878", insecure=True)
Регистрируем провайдеров
# Postgres provider: holds the source table and runs SQL transformations.
postgres = ff.register_postgres(
    name="postgres-data",
    host="host.docker.internal",  # The docker dns name for postgres
    port="5432",
    user=POSTGRES_USERNAME,
    password=POSTGRES_PASSWORD,
    database=POSTGRES_DATABASE,
)

# Redis provider: passed as the inference store for the features below.
redis = ff.register_redis(
    name="redis-data",
    host="host.docker.internal",  # The docker dns name for redis
    port=6379,
)

# Register the raw table as a dataset; the dataset name and the Postgres
# table name are intentionally the same value.
dataset = postgres.register_table(
    name=DATASET_TABLE,
    table=DATASET_TABLE,  # This is the table's name in Postgres
)

# Submit the provider and dataset definitions declared above.
client.apply()
Applying Run: 2024-06-30t21-33-05
Creating provider postgres-data
Creating provider redis-data
Creating source street_tree_census_tree_data 2024-06-30t21-33-05
Looks like an equivalent source variant already exists, going to use its variant: 2024-06-30t12-53-23
@postgres.sql_transformation(inputs=[dataset])
def cleaning_dataset(dataset):
    # Returns the SQL text for a cleaned dataset: it projects the meaningful
    # columns (tree_id, block_id, tree_dbh, status, health, spc_common,
    # latitude, longitude) and keeps only rows where none of them is NULL.
    #
    # "{{dataset}}" is returned as a literal placeholder (this is a plain
    # string, not an f-string); presumably Featureform substitutes the
    # registered input for it - confirm against the decorator's docs.
    #
    # NOTE(review): documented with comments rather than a docstring in case
    # the decorator reads __doc__ - confirm before converting.
    query = (
        "SELECT tree_id, block_id, tree_dbh, status, health, spc_common, "
        "latitude, longitude "
        "FROM {{dataset}} "
        "WHERE tree_id IS NOT NULL "
        "AND block_id IS NOT NULL "
        "AND tree_dbh IS NOT NULL "
        "AND status IS NOT NULL "
        "AND health IS NOT NULL "
        "AND spc_common IS NOT NULL "
        "AND latitude IS NOT NULL "
        "AND longitude IS NOT NULL"
    )
    return query
# Fetch up to 50 rows of the cleaned dataset as a dataframe and show the
# first three (the bare expression only renders output in a notebook cell).
data = client.dataframe(cleaning_dataset, limit=50)
data.head(3)
Applying Run: 2024-06-30t21-34-37
Creating source cleaning_dataset 2024-06-30t21-34-37
Looks like an equivalent source variant already exists, going to use its variant: 2024-06-30t21-23-28
"tree_id" | "block_id" | "tree_dbh" | "status" | "health" | "spc_common" | "latitude" | "longitude" | |
---|---|---|---|---|---|---|---|---|
0 | 180683 | 348711 | 3 | Alive | Fair | red maple | 40.723092 | -73.844215 |
1 | 200540 | 315986 | 21 | Alive | Fair | pin oak | 40.794111 | -73.818679 |
2 | 204026 | 218365 | 3 | Alive | Good | honeylocust | 40.717581 | -73.936608 |
Заполнение данных
import featureform as ff
# @ff.entity
# class Example:
# age = ff.MultiFeature(dataset, dataframe, variant="simple", include_columns=["age", "top_item"], entity_column="user", timestamp_column="timestamp", inference_store=redis)
# Entity grouping the features and the label derived from cleaning_dataset.
# Documented with comments rather than a class docstring so the decorated
# class body stays limited to the two resource definitions.
@ff.entity
class PreparedData:
    # Register multiple columns from a dataset as features, keyed by tree_id.
    # The second argument is a small dataframe sample of the same source.
    cleaned_data = ff.MultiFeature(
        cleaning_dataset,
        client.dataframe(cleaning_dataset, limit=10),
        entity_column="tree_id",
        include_columns=[
            "block_id",
            "tree_dbh",
            "status",
            "health",
            "spc_common",
            "latitude",
            "longitude",
        ],
        inference_store=redis,
        # variant=VARIANT,
    )
    # Label built from the tree_id and block_id columns.
    # NOTE(review): block_id is also registered as a feature above and the
    # label is declared as ff.Bool - confirm this column/type is intended.
    labeled = ff.Label(
        cleaning_dataset[["tree_id", "block_id"]],
        type=ff.Bool,
    )
# Combine the entity's label and its multi-column features into one
# training set named "training_set".
training_set = ff.register_training_set(
    "training_set",
    label=PreparedData.labeled,
    features=PreparedData.cleaned_data,
)

# Submit the entity, features, label and training set declared above.
client.apply()
No resources to apply
Applying Run: 2024-06-30t21-37-08
Creating entity prepareddata
Creating feature health 2024-06-30t21-37-08
Creating feature longitude 2024-06-30t21-37-08
Creating feature latitude 2024-06-30t21-37-08
Creating feature block_id 2024-06-30t21-37-08
Creating feature status 2024-06-30t21-37-08
Creating feature spc_common 2024-06-30t21-37-08
Creating feature tree_dbh 2024-06-30t21-37-08
Creating label labeled 2024-06-30t21-37-08
Creating training-set training_set 2024-06-30t21-37-08