import featureform as ff
from dotenv import load_dotenv
import os
# Load environment variables via python-dotenv before reading them below.
load_dotenv()
# Postgres connection settings; os.getenv yields None for any variable that is
# not set, and that None is passed on unchanged to the provider registration.
POSTGRES_DATABASE = os.getenv('POSTGRES_DATABASE')
POSTGRES_USERNAME = os.getenv('POSTGRES_USERNAME')
POSTGRES_PASSWORD = os.getenv('POSTGRES_PASSWORD')
# Postgres table name; also reused as the name of the registered source.
DATASET_TABLE = "street_tree_census_tree_data"
# Only referenced from a commented-out `variant=` argument further down.
VARIANT = "preprocessing"
# Featureform client pointed at a local deployment over an insecure channel.
# (A notebook heading had been fused onto the end of this line, which made it
# a syntax error; the heading itself still follows on its own line.)
client = ff.Client(host="localhost:7878", insecure=True)
Регистрируем провайдеров
# Register the Postgres provider using the credentials read from the environment.
postgres = ff.register_postgres(
    name="postgres-data",
    host="host.docker.internal",  # The docker dns name for postgres
    port="5432",
    user=POSTGRES_USERNAME,
    password=POSTGRES_PASSWORD,
    database=POSTGRES_DATABASE,
)
# Register the Redis provider; it is passed as `inference_store` further down.
redis = ff.register_redis(
    name="redis-data",
    host="host.docker.internal",  # The docker dns name for redis
    port=6379,
)
# Register the existing Postgres table as a source under the same name.
dataset = postgres.register_table(
    name=DATASET_TABLE,
    table=DATASET_TABLE,  # This is the table's name in Postgres
)
# Apply the registrations above. The recorded notebook output is kept below as
# comments (it had been fused onto the call, which made the line invalid).
client.apply()
# Output:
#   Applying Run: 2024-06-30t21-33-05
#   Creating provider postgres-data
#   Creating provider redis-data
#   Creating source street_tree_census_tree_data 2024-06-30t21-33-05
#   Looks like an equivalent source variant already exists, going to use its variant:  2024-06-30t12-53-23
@postgres.sql_transformation(inputs=[dataset])
def cleaning_dataset(dataset):
    # Returns the SQL text of a transformation that builds a new dataset made
    # of a few meaningful columns:
    # tree_id, block_id, tree_dbh, status, health, spc_common, latitude, longitude.
    # Rows where any of those columns is NULL are filtered out.
    # The description stays in comments (no docstring), so the function's
    # __doc__ remains unset exactly as before.
    kept_columns = (
        "tree_id",
        "block_id",
        "tree_dbh",
        "status",
        "health",
        "spc_common",
        "latitude",
        "longitude",
    )
    projection = ", ".join(kept_columns)
    not_null_filter = " AND ".join(
        column + " IS NOT NULL" for column in kept_columns
    )
    # `{{dataset}}` is left as a literal placeholder in the returned query text.
    return "SELECT " + projection + " FROM {{dataset}} WHERE " + not_null_filter
# Fetch the transformation result as a dataframe (limit=50) and show its first
# three rows. The recorded notebook output is kept below as comments (it had
# been fused onto the `data.head(3)` line, which made the line invalid).
data = client.dataframe(cleaning_dataset, limit=50)
data.head(3)
# Output:
#   Applying Run: 2024-06-30t21-34-37
#   Creating source cleaning_dataset 2024-06-30t21-34-37
#   Looks like an equivalent source variant already exists, going to use its variant:  2024-06-30t21-23-28
|  | "tree_id" | "block_id" | "tree_dbh" | "status" | "health" | "spc_common" | "latitude" | "longitude" |
|---|---|---|---|---|---|---|---|---|
| 0 | 180683 | 348711 | 3 | Alive | Fair | red maple | 40.723092 | -73.844215 | 
| 1 | 200540 | 315986 | 21 | Alive | Fair | pin oak | 40.794111 | -73.818679 | 
| 2 | 204026 | 218365 | 3 | Alive | Good | honeylocust | 40.717581 | -73.936608 | 
Заполнение данных
import featureform as ff
# @ff.entity
# class Example:
#   age = ff.MultiFeature(dataset, dataframe, variant="simple", include_columns=["age", "top_item"], entity_column="user", timestamp_column="timestamp", inference_store=redis)
# Featureform entity that groups the feature and label definitions built on top
# of the `cleaning_dataset` transformation.
@ff.entity
class PreparedData:
    # Register multiple columns from a dataset as features
    # The second positional argument is a 10-row dataframe sample of the same
    # transformation; presumably it is used to infer the column types -- TODO
    # confirm against the Featureform MultiFeature documentation.
    cleaned_data = ff.MultiFeature(
        cleaning_dataset,
        client.dataframe(cleaning_dataset, limit=10),
        entity_column="tree_id",
        include_columns=
            ["block_id", "tree_dbh", "status", "health", "spc_common", "latitude", "longitude"],
        inference_store=redis,
        # variant=VARIANT,
    )
    
    # Label taken from the (tree_id, block_id) column pair of the same source.
    # NOTE(review): the label is declared as ff.Bool, but the sample rows shown
    # earlier in this file list block_id values such as 348711, and block_id is
    # also one of the feature columns above -- confirm the intended label
    # column and its type.
    labeled = ff.Label(
        cleaning_dataset[["tree_id", "block_id"]], 
        type=ff.Bool,
    )
# Combine the entity's label and its feature group into one named training set.
training_set = ff.register_training_set(
    "training_set",
    label=PreparedData.labeled,
    features=PreparedData.cleaned_data,
)
# Apply the entity, feature, label and training-set registrations. The recorded
# notebook output is kept below as comments (it had been fused onto the call,
# which made the line invalid).
client.apply()
# Output:
#   No resources to apply
#   Applying Run: 2024-06-30t21-37-08
#   Creating entity prepareddata
#   Creating feature health 2024-06-30t21-37-08
#   Creating feature longitude 2024-06-30t21-37-08
#   Creating feature latitude 2024-06-30t21-37-08
#   Creating feature block_id 2024-06-30t21-37-08
#   Creating feature status 2024-06-30t21-37-08
#   Creating feature spc_common 2024-06-30t21-37-08
#   Creating feature tree_dbh 2024-06-30t21-37-08
#   Creating label labeled 2024-06-30t21-37-08
#   Creating training-set training_set 2024-06-30t21-37-08