In [1]:
import os
import datetime
import pandas as pd
import sklearn
from sklearn.metrics import silhouette_score
import tslearn
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.clustering import TimeSeriesKMeans
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
seed = 42
np.random.seed(seed)

Nahrání dat

V této části nahrávám a upravuji data do potřebného tvaru.

Metadata

Nahrání souboru s metadaty o zákaznících - např. sazba či počet fází.

In [2]:
AREA = "kunratice"
df_meta = pd.read_csv("../../scripts/data/metadata.csv", usecols=["id", "id.1", "tdd_class", "area"])
df_meta_kunr = df_meta.loc[(df_meta["area"] == AREA) | (df_meta["area"] == "barrandov") | (df_meta["area"] == "smichov")]
def f(x):
    x = x.split("_")
    return f"{x[0].strip()} {x[1].strip()}" 
ids = df_meta_kunr["id.1"]
In [3]:
def load_data(folder_path: str, month_set: set, area) -> pd.DataFrame:
    dfs = []
    all_data = os.listdir(folder_path)
    for file in all_data:
        for month in month_set:
            if file.rfind(month) != -1:
                df_part = pd.read_csv(f"{folder_path}{file}", usecols=['DeviceID', 'Timestamp', 'ActiveEnergyImport'])
                df_part["DeviceID"] = df_part["DeviceID"].apply(lambda x: "_".join(x.split()+[area]))
                dfs.append(df_part)
    return dfs

Nahrání všech dat do jednoho DataFrame

In [4]:
MONTHS = {"2016-12", "2017-02", "2017-01",}
dfs_kunr = load_data(f"../../scripts/data/{AREA}/", MONTHS, AREA)
dfs_barr= load_data(f"../../scripts/data/barrandov/", MONTHS, "barrandov")
dfs_smi = load_data(f"../../scripts/data/smichov/", MONTHS, "smichov")

df = pd.concat([*dfs_kunr, *dfs_smi, *dfs_barr], ignore_index=True)
df.tail(2)
Out[4]:
DeviceID Timestamp ActiveEnergyImport
47322082 ODBERATEL_9_barrandov 16.02.2017 5:59:00 47959.0
47322083 ODBERATEL_9_barrandov 16.02.2017 6:00:00 47959.0

Úprava sloupce Timestamp na native datetime a nastavení tohoto sloupce jako index datové struktury.

In [5]:
df = df.loc[df["DeviceID"].isin(list(ids))]
df.loc[:, "Timestamp"] = df["Timestamp"].apply(lambda x: str(x).strip())
df["Timestamp"] = pd.to_datetime(df["Timestamp"], format="%d.%m.%Y %H:%M:%S")
df = df.set_index("Timestamp")
df = df.dropna()
df.head(2)
Out[5]:
DeviceID ActiveEnergyImport
Timestamp
2017-02-09 08:01:00 TRAFOSTANICE_1_kunratice 2474940.0
2017-02-09 08:02:00 TRAFOSTANICE_1_kunratice 2474950.0

Pomocná funkce, která přidává sloupce Consumption - znázorňující spotřebu a 15mins - zobrazující 15 minutový interval.

In [6]:
def modify_group(group):
    """
    Adds Consumption and norm_consumption to group
    """
    group.index = group.index.sort_values()
    # resample by hour
    # resample by 15 min -> "15T
    group = group.resample("15T").max() 
    group.loc[:, "Consumption"] = group["ActiveEnergyImport"].diff()
    group.loc[group["Consumption"] < 0, "Consumption"] = 0
    group.loc[group["Consumption"] > 1000, "Consumption"] = 0
    group["15mins"] = group.index.time
    group["Timestamp"] = group.index
          
    return group
In [7]:
groups = df.groupby("DeviceID")

Aplikace pomocné funkce na agregovaná data podle sloupce DeviceID

In [8]:
groups = groups.apply(modify_group)
In [9]:
modified_df = groups.reset_index(drop=True).set_index("Timestamp")
In [10]:
modified_df["Timestamp"] = modified_df.index
group_mean = modified_df.groupby(["DeviceID","15mins"]).mean()

Zde upravím agregovaná data zpátky na DataFrame

In [11]:
gst = group_mean.reset_index()
gst.head()
Out[11]:
DeviceID 15mins ActiveEnergyImport Consumption
0 ODBERATEL_100_barrandov 00:00:00 730681.677778 61.088889
1 ODBERATEL_100_barrandov 00:15:00 727652.400000 57.155556
2 ODBERATEL_100_barrandov 00:30:00 727713.744444 61.344444
3 ODBERATEL_100_barrandov 00:45:00 727785.455556 71.711111
4 ODBERATEL_100_barrandov 01:00:00 727852.477778 67.022222
In [12]:
def normalize_consumption(group):
    group["norm_cons"] = group["Consumption"] / group["Consumption"].max()
    return group
In [13]:
gst_group = gst.groupby("DeviceID").apply(normalize_consumption)
gst_pivoted = gst_group.pivot_table(index="DeviceID", columns="15mins", values="norm_cons")

Clustering

V této části provádím shlukovou analýzu. Nejprve je potřeba vybrat vhodný počet shluků a poté provést a zhodnotit shlukovou analýzu.

Vybrání vhodného počtu shluků

Vhodný počet shluků dělám podle hodnoty setrvačnosti (inertia).

In [14]:
inertias = []
N_MAX = 40
range_ = range(2, N_MAX, 2)
for n in range_:
    kmeans_n = sklearn.cluster.KMeans(random_state=42, n_clusters=n)
    kmeans_n.fit(X=gst_pivoted.values)
    inertias.append(kmeans_n.inertia_)


plt.figure(figsize=(20,10))
ax = plt.gca()
ax.tick_params(axis = 'both', which = 'major', labelsize = 16)
plt.axis([0, 40, 0, 1200])
plt.plot(range_, inertias, linewidth=3)
plt.title("Závislost setrvačnosti (inertia) na parametru k (počet shluků)", fontdict={"fontsize": 20})
plt.xlabel("k (počet shluků) [-]", fontdict={"fontsize": 15})
plt.ylabel("Setrvačnost [-]", fontdict={"fontsize": 15})

plt.show()

Na základě grafu výše jsem se rozhodl pro hodnotu 11.

Clustering

Po vybrání vhodného počtu clusterů jsem vytvořil několik pomocných funkcí k zobrazení clusterů a dalších užitečných věcí.

In [15]:
CLUSTER_XLABELS = [datetime.time.strftime(x, format="%H:%M") for x in gst["15mins"].unique()]
n_clusters = 11
In [16]:
def plot_clusters(data, predictions, n_clusters, xlabels, cluster_centers):
    """
    Plots all clusters as individual plots
    """
    for yi in range(n_clusters):
        fig, ax = plt.subplots(figsize=(20,10))
        plt.xticks(rotation=90)
        ax.tick_params(axis = 'both', which = 'major', labelsize = 16)
        for xx in data.values[predictions == yi]:
            ax.plot(xx.ravel(), "k-", alpha=.2)
        ax.plot(cluster_centers[yi].ravel(), "r-", linewidth=2)
        ax.set_xticks(range(96))
        ax.set_xticklabels(xlabels)
        ax.set_ylabel("Normovaná spotřeba typického dne [-]", fontdict={"fontsize": 15})
        ax.set_xlabel("Čas [h]", fontdict={"fontsize": 15})
        ax.set_ylim(0, 1)
        ax.set_xlim(0, 96)
        for label in ax.xaxis.get_ticklabels()[1::2]:
            label.set_visible(False)
        plt.title(f"Cluster {yi + 1}", fontdict={"fontsize": 20})
        plt.show()
In [17]:
def plot_clusters_subplot(data, predictions, n_clusters, xlabels, cluster_centers):
    """
    Plots all clusters in one plot as subplots
    """
    yticklabels = [str(x / 4) for x in range(0, 5)]
    plt.figure(figsize=(20,20))
    for yi in range(n_clusters):
        axes = plt.subplot(n_clusters//2+1, 2, 1+yi)
        plt.xticks(rotation=90)
        axes.tick_params(axis = 'both', which = 'major', labelsize = 16)
        for xx in data.values[predictions == yi]:
            axes.plot(xx.ravel(), "k-", alpha=.2)
        axes.plot(cluster_centers[yi].ravel(), "r-", linewidth=2)
        axes.set_xticks(range(96))
        axes.set_xticklabels(xlabels, fontdict={"fontsize": 12})
        axes.set_yticklabels(axes.get_yticklabels(), fontdict={"fontsize": 12})
        axes.set_ylim(0, 1)
        axes.set_yticklabels(yticklabels, fontdict={"fontsize": 12})
        axes.set_xlim(0, 96)
        for label in axes.xaxis.get_ticklabels()[1::2]:
            label.set_visible(False)
        axes.set_title(f"Cluster {yi + 1}", fontdict={"fontsize": 12})
        if yi == n_clusters // 2 - 1:
            axes.set_ylabel("Normovaná spotřeba typického dne [-]", fontdict={"fontsize": 11})
            axes.set_xlabel("Čas [h]", fontdict={"fontsize": 11})
    plt.tight_layout()
    plt.show()
In [18]:
def plot_corr_matrix(index, predictions, n_clusters):
    """
    Plots corr matrix that shows assigned table with clusters and consumers
    """
    df = pd.DataFrame({"DeviceID": index.index, "cluster": predictions})
    df = df.apply(lambda row: add_tdd_class(row), axis=1)
    df_grouped = df.groupby(["cluster", "tdd_class"]).count()
    df_unstacked = df_grouped.unstack().fillna(0)
    fig, ax = plt.subplots(figsize=(20,10))
    
    im = ax.imshow(df_unstacked, cmap="Reds")
    r = ax.figure.colorbar(im, ax=ax)
    ticks = range(0, n_clusters)
    plt.yticks(ticks)
    ax.set_yticklabels(range(1, n_clusters+1))
    # plt.xticklabels()
    ax.set_ylabel("Cluster [-]", fontdict={"fontsize": 15})
    ax.set_xlabel("třída TDD [-]", fontdict={"fontsize": 15})
    xtickslabels = ["", "neznámá", *[str(x) for x in range(1,8)]]

    ax.set_xticklabels(xtickslabels)
    for i, cas in enumerate(df_unstacked.index):
        for j, c in enumerate(df_unstacked.iloc[i]):
            ax.text(j-.2, i+.2, c, fontsize=14)
    plt.title("Rozdělení tříd TDD v jednotlivých clusterech", fontdict={"fontsize": 20})
    fig.tight_layout()
    plt.show()
In [19]:
def add_tdd_class(row):
    """
    Adds TDD class to every row
    """
    res = df_meta_kunr.loc[df_meta_kunr["id.1"] == row["DeviceID"]]
    if pd.isna(res.iloc[0]["tdd_class"]):
        tdd_class = 0
    else:
        tdd_class = res.iloc[0]["tdd_class"]
    row["tdd_class"] = tdd_class
    return row

Euklidovská metrika

Zde provádím spuštění algoritmu a jejich vykreslení do grafu

In [20]:
kmeans = sklearn.cluster.KMeans(random_state=42, n_clusters=n_clusters, max_iter=300)
pred = kmeans.fit_predict(X=gst_pivoted.values)
plot_clusters_subplot(gst_pivoted, pred, n_clusters, CLUSTER_XLABELS, kmeans.cluster_centers_)

Jelikož shluková analýza odhalila špatná data, provedl jsem ji znovu bez daného clusteru (zde cluster 8).

In [21]:
n_clusters = 11
wo_bad_data = gst_pivoted.iloc[pred != 8]
kmeans_bad = sklearn.cluster.KMeans(random_state=42, n_clusters=n_clusters)
pred_bad = kmeans_bad.fit_predict(X=wo_bad_data.values)
plot_clusters_subplot(wo_bad_data, pred_bad, n_clusters, CLUSTER_XLABELS, kmeans_bad.cluster_centers_)
In [22]:
plot_corr_matrix(wo_bad_data, pred_bad, n_clusters)

DTW metrika

Zde provádím shlukovou analýzu pomocí metriky DTW

In [23]:
ts_kmeans = tslearn.clustering.TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw")
In [24]:
ts_pred = ts_kmeans.fit_predict(gst_pivoted)
In [25]:
tsdata = []
for i, arr in enumerate(wo_bad_data.values):
    tsdata.append(arr)

tsdata_all = tslearn.utils.to_time_series_dataset([*tsdata])
In [26]:
print("DTW k-means")
kmts = TimeSeriesKMeans(n_clusters=n_clusters, verbose=True, metric="dtw", random_state=seed, n_init=2, max_iter_barycenter=10)

y_pred = kmts.fit_predict(tsdata_all)
DTW k-means
Init 1
0.581 --> 0.408 --> 0.396 --> 0.393 --> 0.393 --> 0.393 --> 0.393 --> 0.393 --> 0.393 --> 0.393 --> 
Init 2
0.590 --> 0.411 --> 0.397 --> 0.394 --> 0.392 --> 0.392 --> 0.390 --> 0.389 --> 0.388 --> 0.387 --> 0.386 --> 0.385 --> 0.385 --> 0.385 --> 
In [27]:
plot_clusters_subplot(wo_bad_data, y_pred, n_clusters, CLUSTER_XLABELS, kmts.cluster_centers_)
In [28]:
plot_corr_matrix(wo_bad_data, pred_bad, n_clusters)

Aplikace špičkového tarifu

In [29]:
new_tariff_cluster = wo_bad_data.iloc[pred_bad == 5]
new_tariff_cluster.head()
Out[29]:
15mins 00:00:00 00:15:00 00:30:00 00:45:00 01:00:00 01:15:00 01:30:00 01:45:00 02:00:00 02:15:00 ... 21:30:00 21:45:00 22:00:00 22:15:00 22:30:00 22:45:00 23:00:00 23:15:00 23:30:00 23:45:00
DeviceID
ODBERATEL_102_barrandov 0.475072 0.498343 0.441788 0.541499 0.582258 0.547987 0.558353 0.570905 0.517241 0.535858 ... 0.619632 0.557083 0.436429 0.464636 0.507792 0.502715 0.491432 0.542910 0.502362 0.544884
ODBERATEL_104_barrandov 0.528244 0.534472 0.608094 0.436745 0.438763 0.537203 0.462869 0.475812 0.449332 0.521657 ... 0.613244 0.730012 0.583014 0.628281 0.584335 0.608718 0.505183 0.434318 0.509026 0.584696
ODBERATEL_111_smichov 0.485201 0.534577 0.541857 0.444094 0.514296 0.570458 0.621940 0.516376 0.555377 0.552257 ... 0.625264 0.579281 0.569767 0.565011 0.476216 0.433404 0.473044 0.489429 0.489429 0.478330
ODBERATEL_119_barrandov 0.504143 0.592092 0.635847 0.597457 0.617861 0.601387 0.521207 0.492868 0.558614 0.479039 ... 0.733583 0.745889 0.728647 0.673179 0.689783 0.693605 0.551511 0.560579 0.547854 0.606212
ODBERATEL_11_barrandov 0.716935 0.578629 0.511694 0.525000 0.491129 0.866129 0.580242 0.469355 0.440726 0.471371 ... 0.638306 0.748790 0.727419 0.579435 0.504435 0.503226 0.591129 0.658468 0.719355 0.564516

5 rows × 96 columns

In [30]:
new_tariff_df = new_tariff_cluster.unstack().reset_index()
In [31]:
LIMIT_VALUE = 0.8
def modify_peak_values(group):
    group_date = group[(group["15mins"] >= datetime.time(16,0)) & (group["15mins"] <= datetime.time(20,0))]
    vals = group_date[0].subtract(LIMIT_VALUE)
    corr_vals = vals[vals > 0]
    index_list = corr_vals.index.tolist()
    _time = group.loc[corr_vals.index]['15mins']
    new_time = pd.to_timedelta(_time.astype(str) - pd.Timedelta(hours=15))
    new_time_index_list = []
    for t in new_time:
        time_vals = str(t).split()[2].split(":")[:2]
        h = int(time_vals[0][1:])
        m = int(time_vals[1])
        ind = group[group["15mins"] == datetime.time(h, m)].index[0]
        new_time_index_list.append(ind)
    if not group.index.empty:
        for i, n_i in zip(index_list, new_time_index_list):
            group.at[i, 0] =  group.loc[i][0] - corr_vals[i]
            group.at[n_i, 0] = group.loc[n_i][0] + corr_vals[i]
    return group
In [32]:
_groups = new_tariff_df.groupby("DeviceID")
new_tariff_df_modified = _groups.apply(modify_peak_values).pivot_table(index="DeviceID", columns="15mins", values=0)
new_tariff_cluster_mean = new_tariff_df_modified.unstack().reset_index().groupby("15mins").mean()
In [33]:
fig, ax = plt.subplots(figsize=(20,10))
plt.xticks(rotation=90)
ax.tick_params(axis = 'both', which = 'major', labelsize = 16)
for xx in new_tariff_df_modified.values:
    ax.plot(xx.ravel(), "k-", alpha=.2)
ax.plot(new_tariff_cluster_mean.values.ravel(), "go-", linewidth=3)
ax.plot(kmeans_bad.cluster_centers_[5], "r-", linewidth=2)
ax.set_xticks(range(96))
ax.set_xticklabels(CLUSTER_XLABELS)
ax.set_ylabel("Normovaná spotřeba typického dne [-]", fontdict={"fontsize": 15})
ax.set_xlabel("Čas [h]", fontdict={"fontsize": 15})
ax.set_ylim(0, 1)
ax.set_xlim(0, 96)
for label in ax.xaxis.get_ticklabels()[1::2]:
    label.set_visible(False)
plt.title(f"Cluster {6}", fontdict={"fontsize": 20})
plt.show()