Импорты и настройка окружения

import featureform as ff
from dotenv import load_dotenv
import os

# Load settings via python-dotenv so the POSTGRES_* variables read below can
# come from a local .env file.
load_dotenv()

# Postgres connection settings; a variable that is not set yields None.
POSTGRES_DATABASE = os.environ.get("POSTGRES_DATABASE")
POSTGRES_USERNAME = os.environ.get("POSTGRES_USERNAME")
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")

# Name of the source table; used both as the Postgres table name and as the
# name of the registered Featureform source.
DATASET_TABLE = "street_tree_census_tree_data"
# Only referenced from a commented-out line in the visible part of this file.
VARIANT = "preprocessing"

# Featureform client for the local deployment.
# NOTE(review): insecure=True presumably disables TLS - fine for localhost only.
client = ff.Client(insecure=True, host="localhost:7878")

Регистрируем провайдеров

# Both providers are addressed through the same docker dns name.
_DOCKER_HOST = "host.docker.internal"

# Offline store: the Postgres database holding the raw dataset.
postgres = ff.register_postgres(
    name="postgres-data",
    database=POSTGRES_DATABASE,
    user=POSTGRES_USERNAME,
    password=POSTGRES_PASSWORD,
    host=_DOCKER_HOST,
    port="5432",  # passed as a string here; the Redis port below is an int
)

# Redis provider; used further down as the inference store for the features.
redis = ff.register_redis(
    name="redis-data",
    host=_DOCKER_HOST,
    port=6379,
)

# Register the raw table as a source, reusing the table name as resource name.
dataset = postgres.register_table(
    table=DATASET_TABLE,  # This is the table's name in Postgres
    name=DATASET_TABLE,
)

# Send the providers and the source registered above to Featureform.
client.apply()
Applying Run: 2024-06-30t21-33-05
Creating provider postgres-data
Creating provider redis-data
Creating source street_tree_census_tree_data 2024-06-30t21-33-05
Looks like an equivalent source variant already exists, going to use its variant:  2024-06-30t12-53-23

@postgres.sql_transformation(inputs=[dataset])
def cleaning_dataset(dataset):
    # The query builds a new dataset that keeps a handful of meaningful columns
    # and drops every row in which any of those columns is NULL.
    kept_columns = (
        "tree_id",
        "block_id",
        "tree_dbh",
        "status",
        "health",
        "spc_common",
        "latitude",
        "longitude",
    )
    projection = ", ".join(kept_columns)
    not_null_filter = " AND ".join(
        f"{column} IS NOT NULL" for column in kept_columns
    )
    # Plain concatenation (not an f-string) keeps the {{dataset}} placeholder
    # literal in the returned SQL text.
    return "SELECT " + projection + " FROM {{dataset}} WHERE " + not_null_filter

# Request the cleaned dataset as a dataframe (limit=50) for a quick look.
data = client.dataframe(cleaning_dataset, limit=50)
# Bare expression on purpose: as the last line of the notebook cell it displays
# the first three rows.
data.head(3)
Applying Run: 2024-06-30t21-34-37
Creating source cleaning_dataset 2024-06-30t21-34-37
Looks like an equivalent source variant already exists, going to use its variant:  2024-06-30t21-23-28

"tree_id" "block_id" "tree_dbh" "status" "health" "spc_common" "latitude" "longitude"
0 180683 348711 3 Alive Fair red maple 40.723092 -73.844215
1 200540 315986 21 Alive Fair pin oak 40.794111 -73.818679
2 204026 218365 3 Alive Good honeylocust 40.717581 -73.936608

Регистрируем признаки, метку и обучающую выборку

import featureform as ff

# @ff.entity
# class Example:
#   age = ff.MultiFeature(dataset, dataframe, variant="simple", include_columns=["age", "top_item"], entity_column="user", timestamp_column="timestamp", inference_store=redis)

@ff.entity
class PreparedData:
    # Entity keyed by tree_id, with features and a label taken from the
    # cleaning_dataset transformation.

    # Register multiple columns from a dataset as features.
    # The second argument is a 10-row sample of the transformation output
    # (presumably used by MultiFeature to inspect the columns - TODO confirm).
    # NOTE(review): block_id is listed here as a feature and is also the label
    # column below; confirm this overlap is intended before training on it.
    cleaned_data = ff.MultiFeature(
        cleaning_dataset,
        client.dataframe(cleaning_dataset, limit=10),
        entity_column="tree_id",
        include_columns=[
            "block_id",
            "tree_dbh",
            "status",
            "health",
            "spc_common",
            "latitude",
            "longitude",
        ],
        inference_store=redis,
        # variant=VARIANT,
    )

    # Label column block_id, keyed by tree_id. The sample rows shown in this
    # notebook hold integer block ids (e.g. 348711), so the label is declared
    # as an integer rather than ff.Bool.
    # NOTE(review): confirm the width against the Postgres column type.
    labeled = ff.Label(
        cleaning_dataset[["tree_id", "block_id"]],
        type=ff.Int64,
    )

# Combine the label with the registered features into a single training set
# named "training_set".
training_set = ff.register_training_set(
    "training_set",
    features=PreparedData.cleaned_data,
    label=PreparedData.labeled,
)

# Send the entity, features, label and training set defined above to
# Featureform.
client.apply()
No resources to apply
Applying Run: 2024-06-30t21-37-08
Creating entity prepareddata
Creating feature health 2024-06-30t21-37-08
Creating feature longitude 2024-06-30t21-37-08
Creating feature latitude 2024-06-30t21-37-08
Creating feature block_id 2024-06-30t21-37-08
Creating feature status 2024-06-30t21-37-08
Creating feature spc_common 2024-06-30t21-37-08
Creating feature tree_dbh 2024-06-30t21-37-08
Creating label labeled 2024-06-30t21-37-08
Creating training-set training_set 2024-06-30t21-37-08