import os
import datetime
import pandas as pd
import sklearn
from sklearn.metrics import silhouette_score
import tslearn
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.clustering import TimeSeriesKMeans
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
seed = 42
np.random.seed(seed)

Nahrání dat¶

V této části nahrávám a upravuji data do potřebného tvaru.

Metadata¶

Nahrání souboru s metadaty o zákaznících - např. sazba či počet fází.

AREA = "kunratice"
df_meta = pd.read_csv("../../scripts/data/metadata.csv", usecols=["id", "id.1", "tdd_class", "area"])
df_meta_kunr = df_meta.loc[(df_meta["area"] == AREA) | (df_meta["area"] == "barrandov") | (df_meta["area"] == "smichov")]
def f(x):
    x = x.split("_")
    return f"{x[0].strip()} {x[1].strip()}" 
ids = df_meta_kunr["id.1"]

def load_data(folder_path: str, month_set: set, area) -> pd.DataFrame:
    dfs = []
    all_data = os.listdir(folder_path)
    for file in all_data:
        for month in month_set:
            if file.rfind(month) != -1:
                df_part = pd.read_csv(f"{folder_path}{file}", usecols=['DeviceID', 'Timestamp', 'ActiveEnergyImport'])
                df_part["DeviceID"] = df_part["DeviceID"].apply(lambda x: "_".join(x.split()+[area]))
                dfs.append(df_part)
    return dfs

Nahrání všech dat do jednoho DataFrame

MONTHS = {"2016-12", "2017-02", "2017-01",}
dfs_kunr = load_data(f"../../scripts/data/{AREA}/", MONTHS, AREA)
dfs_barr= load_data(f"../../scripts/data/barrandov/", MONTHS, "barrandov")
dfs_smi = load_data(f"../../scripts/data/smichov/", MONTHS, "smichov")

df = pd.concat([*dfs_kunr, *dfs_smi, *dfs_barr], ignore_index=True)
df.tail(2)

Úprava sloupce Timestamp na native datetime a nastavení tohoto sloupce jako index datové struktury.

df = df.loc[df["DeviceID"].isin(list(ids))]
df.loc[:, "Timestamp"] = df["Timestamp"].apply(lambda x: str(x).strip())
df["Timestamp"] = pd.to_datetime(df["Timestamp"], format="%d.%m.%Y %H:%M:%S")
df = df.set_index("Timestamp")
df = df.dropna()
df.head(2)

Pomocná funkce, která přidává sloupce Consumption - znázorňující spotřebu a 15mins - zobrazující 15 minutový interval.

def modify_group(group):
    """
    Adds Consumption and norm_consumption to group
    """
    group.index = group.index.sort_values()
    # resample by hour
    # resample by 15 min -> "15T
    group = group.resample("15T").max() 
    group.loc[:, "Consumption"] = group["ActiveEnergyImport"].diff()
    group.loc[group["Consumption"] < 0, "Consumption"] = 0
    group.loc[group["Consumption"] > 1000, "Consumption"] = 0
    group["15mins"] = group.index.time
    group["Timestamp"] = group.index
          
    return group

groups = df.groupby("DeviceID")

Aplikace pomocné funkce na agregovaná data podle sloupce DeviceID

groups = groups.apply(modify_group)

modified_df = groups.reset_index(drop=True).set_index("Timestamp")

modified_df["Timestamp"] = modified_df.index
group_mean = modified_df.groupby(["DeviceID","15mins"]).mean()

Zde upravím agregovaná data zpátky na DataFrame

gst = group_mean.reset_index()
gst.head()

def normalize_consumption(group):
    group["norm_cons"] = group["Consumption"] / group["Consumption"].max()
    return group

gst_group = gst.groupby("DeviceID").apply(normalize_consumption)
gst_pivoted = gst_group.pivot_table(index="DeviceID", columns="15mins", values="norm_cons")

Clustering¶

V této části provádím shlukovou analýzu. Nejprve je potřeba vybrat vhodný počet shluků a poté provést a zhodnotit shlukovou analýzu.

Vybrání vhodného počtu shluků¶

Vhodný počet shluků dělám podle hodnoty setrvačnosti (inertia).

inertias = []
N_MAX = 40
range_ = range(2, N_MAX, 2)
for n in range_:
    kmeans_n = sklearn.cluster.KMeans(random_state=42, n_clusters=n)
    kmeans_n.fit(X=gst_pivoted.values)
    inertias.append(kmeans_n.inertia_)


plt.figure(figsize=(20,10))
ax = plt.gca()
ax.tick_params(axis = 'both', which = 'major', labelsize = 16)
plt.axis([0, 40, 0, 1200])
plt.plot(range_, inertias, linewidth=3)
plt.title("Závislost setrvačnosti (inertia) na parametru k (počet shluků)", fontdict={"fontsize": 20})
plt.xlabel("k (počet shluků) [-]", fontdict={"fontsize": 15})
plt.ylabel("Setrvačnost [-]", fontdict={"fontsize": 15})

plt.show()

Na základě grafu výše jsem se rozhodl pro hodnotu 11.

Clustering¶

Po vybrání vhodného počtu clusterů jsem vytvořil několik pomocných funkcí k zobrazení clusterů a dalších užitečných věcí.

CLUSTER_XLABELS = [datetime.time.strftime(x, format="%H:%M") for x in gst["15mins"].unique()]
n_clusters = 11

def plot_clusters(data, predictions, n_clusters, xlabels, cluster_centers):
    """
    Plots all clusters as individual plots
    """
    for yi in range(n_clusters):
        fig, ax = plt.subplots(figsize=(20,10))
        plt.xticks(rotation=90)
        ax.tick_params(axis = 'both', which = 'major', labelsize = 16)
        for xx in data.values[predictions == yi]:
            ax.plot(xx.ravel(), "k-", alpha=.2)
        ax.plot(cluster_centers[yi].ravel(), "r-", linewidth=2)
        ax.set_xticks(range(96))
        ax.set_xticklabels(xlabels)
        ax.set_ylabel("Normovaná spotřeba typického dne [-]", fontdict={"fontsize": 15})
        ax.set_xlabel("Čas [h]", fontdict={"fontsize": 15})
        ax.set_ylim(0, 1)
        ax.set_xlim(0, 96)
        for label in ax.xaxis.get_ticklabels()[1::2]:
            label.set_visible(False)
        plt.title(f"Cluster {yi + 1}", fontdict={"fontsize": 20})
        plt.show()

def plot_clusters_subplot(data, predictions, n_clusters, xlabels, cluster_centers):
    """
    Plots all clusters in one plot as subplots
    """
    yticklabels = [str(x / 4) for x in range(0, 5)]
    plt.figure(figsize=(20,20))
    for yi in range(n_clusters):
        axes = plt.subplot(n_clusters//2+1, 2, 1+yi)
        plt.xticks(rotation=90)
        axes.tick_params(axis = 'both', which = 'major', labelsize = 16)
        for xx in data.values[predictions == yi]:
            axes.plot(xx.ravel(), "k-", alpha=.2)
        axes.plot(cluster_centers[yi].ravel(), "r-", linewidth=2)
        axes.set_xticks(range(96))
        axes.set_xticklabels(xlabels, fontdict={"fontsize": 12})
        axes.set_yticklabels(axes.get_yticklabels(), fontdict={"fontsize": 12})
        axes.set_ylim(0, 1)
        axes.set_yticklabels(yticklabels, fontdict={"fontsize": 12})
        axes.set_xlim(0, 96)
        for label in axes.xaxis.get_ticklabels()[1::2]:
            label.set_visible(False)
        axes.set_title(f"Cluster {yi + 1}", fontdict={"fontsize": 12})
        if yi == n_clusters // 2 - 1:
            axes.set_ylabel("Normovaná spotřeba typického dne [-]", fontdict={"fontsize": 11})
            axes.set_xlabel("Čas [h]", fontdict={"fontsize": 11})
    plt.tight_layout()
    plt.show()

def plot_corr_matrix(index, predictions, n_clusters):
    """
    Plots corr matrix that shows assigned table with clusters and consumers
    """
    df = pd.DataFrame({"DeviceID": index.index, "cluster": predictions})
    df = df.apply(lambda row: add_tdd_class(row), axis=1)
    df_grouped = df.groupby(["cluster", "tdd_class"]).count()
    df_unstacked = df_grouped.unstack().fillna(0)
    fig, ax = plt.subplots(figsize=(20,10))
    
    im = ax.imshow(df_unstacked, cmap="Reds")
    r = ax.figure.colorbar(im, ax=ax)
    ticks = range(0, n_clusters)
    plt.yticks(ticks)
    ax.set_yticklabels(range(1, n_clusters+1))
    # plt.xticklabels()
    ax.set_ylabel("Cluster [-]", fontdict={"fontsize": 15})
    ax.set_xlabel("třída TDD [-]", fontdict={"fontsize": 15})
    xtickslabels = ["", "neznámá", *[str(x) for x in range(1,8)]]

    ax.set_xticklabels(xtickslabels)
    for i, cas in enumerate(df_unstacked.index):
        for j, c in enumerate(df_unstacked.iloc[i]):
            ax.text(j-.2, i+.2, c, fontsize=14)
    plt.title("Rozdělení tříd TDD v jednotlivých clusterech", fontdict={"fontsize": 20})
    fig.tight_layout()
    plt.show()

def add_tdd_class(row):
    """
    Adds TDD class to every row
    """
    res = df_meta_kunr.loc[df_meta_kunr["id.1"] == row["DeviceID"]]
    if pd.isna(res.iloc[0]["tdd_class"]):
        tdd_class = 0
    else:
        tdd_class = res.iloc[0]["tdd_class"]
    row["tdd_class"] = tdd_class
    return row

Euklidovská metrika¶

Zde provádím spuštění algoritmu a jejich vykreslení do grafu

kmeans = sklearn.cluster.KMeans(random_state=42, n_clusters=n_clusters, max_iter=300)
pred = kmeans.fit_predict(X=gst_pivoted.values)
plot_clusters_subplot(gst_pivoted, pred, n_clusters, CLUSTER_XLABELS, kmeans.cluster_centers_)

Jelikož shluková analýza odhalila špatná data, provedl jsem ji znovu bez daného clusteru (zde cluster 8).

n_clusters = 11
wo_bad_data = gst_pivoted.iloc[pred != 8]
kmeans_bad = sklearn.cluster.KMeans(random_state=42, n_clusters=n_clusters)
pred_bad = kmeans_bad.fit_predict(X=wo_bad_data.values)
plot_clusters_subplot(wo_bad_data, pred_bad, n_clusters, CLUSTER_XLABELS, kmeans_bad.cluster_centers_)

plot_corr_matrix(wo_bad_data, pred_bad, n_clusters)

DTW metrika¶

Zde provádím shlukovou analýzu pomocí metriky DTW

ts_kmeans = tslearn.clustering.TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw")

ts_pred = ts_kmeans.fit_predict(gst_pivoted)

tsdata = []
for i, arr in enumerate(wo_bad_data.values):
    tsdata.append(arr)

tsdata_all = tslearn.utils.to_time_series_dataset([*tsdata])

print("DTW k-means")
kmts = TimeSeriesKMeans(n_clusters=n_clusters, verbose=True, metric="dtw", random_state=seed, n_init=2, max_iter_barycenter=10)

y_pred = kmts.fit_predict(tsdata_all)

DTW k-means
Init 1
0.581 --> 0.408 --> 0.396 --> 0.393 --> 0.393 --> 0.393 --> 0.393 --> 0.393 --> 0.393 --> 0.393 --> 
Init 2
0.590 --> 0.411 --> 0.397 --> 0.394 --> 0.392 --> 0.392 --> 0.390 --> 0.389 --> 0.388 --> 0.387 --> 0.386 --> 0.385 --> 0.385 --> 0.385 -->

plot_clusters_subplot(wo_bad_data, y_pred, n_clusters, CLUSTER_XLABELS, kmts.cluster_centers_)

plot_corr_matrix(wo_bad_data, pred_bad, n_clusters)

Aplikace špičkového tarifu¶

new_tariff_cluster = wo_bad_data.iloc[pred_bad == 5]
new_tariff_cluster.head()

new_tariff_df = new_tariff_cluster.unstack().reset_index()

LIMIT_VALUE = 0.8
def modify_peak_values(group):
    group_date = group[(group["15mins"] >= datetime.time(16,0)) & (group["15mins"] <= datetime.time(20,0))]
    vals = group_date[0].subtract(LIMIT_VALUE)
    corr_vals = vals[vals > 0]
    index_list = corr_vals.index.tolist()
    _time = group.loc[corr_vals.index]['15mins']
    new_time = pd.to_timedelta(_time.astype(str) - pd.Timedelta(hours=15))
    new_time_index_list = []
    for t in new_time:
        time_vals = str(t).split()[2].split(":")[:2]
        h = int(time_vals[0][1:])
        m = int(time_vals[1])
        ind = group[group["15mins"] == datetime.time(h, m)].index[0]
        new_time_index_list.append(ind)
    if not group.index.empty:
        for i, n_i in zip(index_list, new_time_index_list):
            group.at[i, 0] =  group.loc[i][0] - corr_vals[i]
            group.at[n_i, 0] = group.loc[n_i][0] + corr_vals[i]
    return group

_groups = new_tariff_df.groupby("DeviceID")
new_tariff_df_modified = _groups.apply(modify_peak_values).pivot_table(index="DeviceID", columns="15mins", values=0)
new_tariff_cluster_mean = new_tariff_df_modified.unstack().reset_index().groupby("15mins").mean()

fig, ax = plt.subplots(figsize=(20,10))
plt.xticks(rotation=90)
ax.tick_params(axis = 'both', which = 'major', labelsize = 16)
for xx in new_tariff_df_modified.values:
    ax.plot(xx.ravel(), "k-", alpha=.2)
ax.plot(new_tariff_cluster_mean.values.ravel(), "go-", linewidth=3)
ax.plot(kmeans_bad.cluster_centers_[5], "r-", linewidth=2)
ax.set_xticks(range(96))
ax.set_xticklabels(CLUSTER_XLABELS)
ax.set_ylabel("Normovaná spotřeba typického dne [-]", fontdict={"fontsize": 15})
ax.set_xlabel("Čas [h]", fontdict={"fontsize": 15})
ax.set_ylim(0, 1)
ax.set_xlim(0, 96)
for label in ax.xaxis.get_ticklabels()[1::2]:
    label.set_visible(False)
plt.title(f"Cluster {6}", fontdict={"fontsize": 20})
plt.show()

	DeviceID	Timestamp	ActiveEnergyImport
47322082	ODBERATEL_9_barrandov	16.02.2017 5:59:00	47959.0
47322083	ODBERATEL_9_barrandov	16.02.2017 6:00:00	47959.0

	DeviceID	ActiveEnergyImport
Timestamp
2017-02-09 08:01:00	TRAFOSTANICE_1_kunratice	2474940.0
2017-02-09 08:02:00	TRAFOSTANICE_1_kunratice	2474950.0

	DeviceID	15mins	ActiveEnergyImport	Consumption
0	ODBERATEL_100_barrandov	00:00:00	730681.677778	61.088889
1	ODBERATEL_100_barrandov	00:15:00	727652.400000	57.155556
2	ODBERATEL_100_barrandov	00:30:00	727713.744444	61.344444
3	ODBERATEL_100_barrandov	00:45:00	727785.455556	71.711111
4	ODBERATEL_100_barrandov	01:00:00	727852.477778	67.022222

15mins	00:00:00	00:15:00	00:30:00	00:45:00	01:00:00	01:15:00	01:30:00	01:45:00	02:00:00	02:15:00	...	21:30:00	21:45:00	22:00:00	22:15:00	22:30:00	22:45:00	23:00:00	23:15:00	23:30:00	23:45:00
DeviceID
ODBERATEL_102_barrandov	0.475072	0.498343	0.441788	0.541499	0.582258	0.547987	0.558353	0.570905	0.517241	0.535858	...	0.619632	0.557083	0.436429	0.464636	0.507792	0.502715	0.491432	0.542910	0.502362	0.544884
ODBERATEL_104_barrandov	0.528244	0.534472	0.608094	0.436745	0.438763	0.537203	0.462869	0.475812	0.449332	0.521657	...	0.613244	0.730012	0.583014	0.628281	0.584335	0.608718	0.505183	0.434318	0.509026	0.584696
ODBERATEL_111_smichov	0.485201	0.534577	0.541857	0.444094	0.514296	0.570458	0.621940	0.516376	0.555377	0.552257	...	0.625264	0.579281	0.569767	0.565011	0.476216	0.433404	0.473044	0.489429	0.489429	0.478330
ODBERATEL_119_barrandov	0.504143	0.592092	0.635847	0.597457	0.617861	0.601387	0.521207	0.492868	0.558614	0.479039	...	0.733583	0.745889	0.728647	0.673179	0.689783	0.693605	0.551511	0.560579	0.547854	0.606212
ODBERATEL_11_barrandov	0.716935	0.578629	0.511694	0.525000	0.491129	0.866129	0.580242	0.469355	0.440726	0.471371	...	0.638306	0.748790	0.727419	0.579435	0.504435	0.503226	0.591129	0.658468	0.719355	0.564516