import os
import datetime
import pandas as pd
import sklearn
from sklearn.metrics import silhouette_score
import tslearn
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.clustering import TimeSeriesKMeans
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
seed = 42
np.random.seed(seed)
AREA = "kunratice"
df_meta = pd.read_csv("../../scripts/data/metadata.csv", usecols=["id", "id.1", "tdd_class", "area"])
df_meta_kunr = df_meta.loc[(df_meta["area"] == AREA) | (df_meta["area"] == "barrandov") | (df_meta["area"] == "smichov")]
def f(x):
x = x.split("_")
return f"{x[0].strip()} {x[1].strip()}"
ids = df_meta_kunr["id.1"]
def load_data(folder_path: str, month_set: set, area) -> pd.DataFrame:
dfs = []
all_data = os.listdir(folder_path)
for file in all_data:
for month in month_set:
if file.rfind(month) != -1:
df_part = pd.read_csv(f"{folder_path}{file}", usecols=['DeviceID', 'Timestamp', 'ActiveEnergyImport'])
df_part["DeviceID"] = df_part["DeviceID"].apply(lambda x: "_".join(x.split()+[area]))
dfs.append(df_part)
return dfs
Nahrání všech dat do jednoho DataFrame
MONTHS = {"2016-12", "2017-02", "2017-01",}
dfs_kunr = load_data(f"../../scripts/data/{AREA}/", MONTHS, AREA)
dfs_barr= load_data(f"../../scripts/data/barrandov/", MONTHS, "barrandov")
dfs_smi = load_data(f"../../scripts/data/smichov/", MONTHS, "smichov")
df = pd.concat([*dfs_kunr, *dfs_smi, *dfs_barr], ignore_index=True)
df.tail(2)
Úprava sloupce Timestamp
na native datetime a nastavení tohoto sloupce jako index datové struktury.
df = df.loc[df["DeviceID"].isin(list(ids))]
df.loc[:, "Timestamp"] = df["Timestamp"].apply(lambda x: str(x).strip())
df["Timestamp"] = pd.to_datetime(df["Timestamp"], format="%d.%m.%Y %H:%M:%S")
df = df.set_index("Timestamp")
df = df.dropna()
df.head(2)
Pomocná funkce, která přidává sloupce Consumption
- znázorňující spotřebu a 15mins
- zobrazující 15 minutový interval.
def modify_group(group):
"""
Adds Consumption and norm_consumption to group
"""
group.index = group.index.sort_values()
# resample by hour
# resample by 15 min -> "15T
group = group.resample("15T").max()
group.loc[:, "Consumption"] = group["ActiveEnergyImport"].diff()
group.loc[group["Consumption"] < 0, "Consumption"] = 0
group.loc[group["Consumption"] > 1000, "Consumption"] = 0
group["15mins"] = group.index.time
group["Timestamp"] = group.index
return group
groups = df.groupby("DeviceID")
Aplikace pomocné funkce na agregovaná data podle sloupce DeviceID
groups = groups.apply(modify_group)
modified_df = groups.reset_index(drop=True).set_index("Timestamp")
modified_df["Timestamp"] = modified_df.index
group_mean = modified_df.groupby(["DeviceID","15mins"]).mean()
Zde upravím agregovaná data zpátky na DataFrame
gst = group_mean.reset_index()
gst.head()
def normalize_consumption(group):
group["norm_cons"] = group["Consumption"] / group["Consumption"].max()
return group
gst_group = gst.groupby("DeviceID").apply(normalize_consumption)
gst_pivoted = gst_group.pivot_table(index="DeviceID", columns="15mins", values="norm_cons")
inertias = []
N_MAX = 40
range_ = range(2, N_MAX, 2)
for n in range_:
kmeans_n = sklearn.cluster.KMeans(random_state=42, n_clusters=n)
kmeans_n.fit(X=gst_pivoted.values)
inertias.append(kmeans_n.inertia_)
plt.figure(figsize=(20,10))
ax = plt.gca()
ax.tick_params(axis = 'both', which = 'major', labelsize = 16)
plt.axis([0, 40, 0, 1200])
plt.plot(range_, inertias, linewidth=3)
plt.title("Závislost setrvačnosti (inertia) na parametru k (počet shluků)", fontdict={"fontsize": 20})
plt.xlabel("k (počet shluků) [-]", fontdict={"fontsize": 15})
plt.ylabel("Setrvačnost [-]", fontdict={"fontsize": 15})
plt.show()
Na základě grafu výše jsem se rozhodl pro hodnotu 11.
Po vybrání vhodného počtu clusterů jsem vytvořil několik pomocných funkcí k zobrazení clusterů a dalších užitečných věcí.
CLUSTER_XLABELS = [datetime.time.strftime(x, format="%H:%M") for x in gst["15mins"].unique()]
n_clusters = 11
def plot_clusters(data, predictions, n_clusters, xlabels, cluster_centers):
"""
Plots all clusters as individual plots
"""
for yi in range(n_clusters):
fig, ax = plt.subplots(figsize=(20,10))
plt.xticks(rotation=90)
ax.tick_params(axis = 'both', which = 'major', labelsize = 16)
for xx in data.values[predictions == yi]:
ax.plot(xx.ravel(), "k-", alpha=.2)
ax.plot(cluster_centers[yi].ravel(), "r-", linewidth=2)
ax.set_xticks(range(96))
ax.set_xticklabels(xlabels)
ax.set_ylabel("Normovaná spotřeba typického dne [-]", fontdict={"fontsize": 15})
ax.set_xlabel("Čas [h]", fontdict={"fontsize": 15})
ax.set_ylim(0, 1)
ax.set_xlim(0, 96)
for label in ax.xaxis.get_ticklabels()[1::2]:
label.set_visible(False)
plt.title(f"Cluster {yi + 1}", fontdict={"fontsize": 20})
plt.show()
def plot_clusters_subplot(data, predictions, n_clusters, xlabels, cluster_centers):
"""
Plots all clusters in one plot as subplots
"""
yticklabels = [str(x / 4) for x in range(0, 5)]
plt.figure(figsize=(20,20))
for yi in range(n_clusters):
axes = plt.subplot(n_clusters//2+1, 2, 1+yi)
plt.xticks(rotation=90)
axes.tick_params(axis = 'both', which = 'major', labelsize = 16)
for xx in data.values[predictions == yi]:
axes.plot(xx.ravel(), "k-", alpha=.2)
axes.plot(cluster_centers[yi].ravel(), "r-", linewidth=2)
axes.set_xticks(range(96))
axes.set_xticklabels(xlabels, fontdict={"fontsize": 12})
axes.set_yticklabels(axes.get_yticklabels(), fontdict={"fontsize": 12})
axes.set_ylim(0, 1)
axes.set_yticklabels(yticklabels, fontdict={"fontsize": 12})
axes.set_xlim(0, 96)
for label in axes.xaxis.get_ticklabels()[1::2]:
label.set_visible(False)
axes.set_title(f"Cluster {yi + 1}", fontdict={"fontsize": 12})
if yi == n_clusters // 2 - 1:
axes.set_ylabel("Normovaná spotřeba typického dne [-]", fontdict={"fontsize": 11})
axes.set_xlabel("Čas [h]", fontdict={"fontsize": 11})
plt.tight_layout()
plt.show()
def plot_corr_matrix(index, predictions, n_clusters):
"""
Plots corr matrix that shows assigned table with clusters and consumers
"""
df = pd.DataFrame({"DeviceID": index.index, "cluster": predictions})
df = df.apply(lambda row: add_tdd_class(row), axis=1)
df_grouped = df.groupby(["cluster", "tdd_class"]).count()
df_unstacked = df_grouped.unstack().fillna(0)
fig, ax = plt.subplots(figsize=(20,10))
im = ax.imshow(df_unstacked, cmap="Reds")
r = ax.figure.colorbar(im, ax=ax)
ticks = range(0, n_clusters)
plt.yticks(ticks)
ax.set_yticklabels(range(1, n_clusters+1))
# plt.xticklabels()
ax.set_ylabel("Cluster [-]", fontdict={"fontsize": 15})
ax.set_xlabel("třída TDD [-]", fontdict={"fontsize": 15})
xtickslabels = ["", "neznámá", *[str(x) for x in range(1,8)]]
ax.set_xticklabels(xtickslabels)
for i, cas in enumerate(df_unstacked.index):
for j, c in enumerate(df_unstacked.iloc[i]):
ax.text(j-.2, i+.2, c, fontsize=14)
plt.title("Rozdělení tříd TDD v jednotlivých clusterech", fontdict={"fontsize": 20})
fig.tight_layout()
plt.show()
def add_tdd_class(row):
"""
Adds TDD class to every row
"""
res = df_meta_kunr.loc[df_meta_kunr["id.1"] == row["DeviceID"]]
if pd.isna(res.iloc[0]["tdd_class"]):
tdd_class = 0
else:
tdd_class = res.iloc[0]["tdd_class"]
row["tdd_class"] = tdd_class
return row
Zde provádím spuštění algoritmu a jejich vykreslení do grafu
kmeans = sklearn.cluster.KMeans(random_state=42, n_clusters=n_clusters, max_iter=300)
pred = kmeans.fit_predict(X=gst_pivoted.values)
plot_clusters_subplot(gst_pivoted, pred, n_clusters, CLUSTER_XLABELS, kmeans.cluster_centers_)
Jelikož shluková analýza odhalila špatná data, provedl jsem ji znovu bez daného clusteru (zde cluster 8
).
n_clusters = 11
wo_bad_data = gst_pivoted.iloc[pred != 8]
kmeans_bad = sklearn.cluster.KMeans(random_state=42, n_clusters=n_clusters)
pred_bad = kmeans_bad.fit_predict(X=wo_bad_data.values)
plot_clusters_subplot(wo_bad_data, pred_bad, n_clusters, CLUSTER_XLABELS, kmeans_bad.cluster_centers_)
plot_corr_matrix(wo_bad_data, pred_bad, n_clusters)
Zde provádím shlukovou analýzu pomocí metriky DTW
ts_kmeans = tslearn.clustering.TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw")
ts_pred = ts_kmeans.fit_predict(gst_pivoted)
tsdata = []
for i, arr in enumerate(wo_bad_data.values):
tsdata.append(arr)
tsdata_all = tslearn.utils.to_time_series_dataset([*tsdata])
print("DTW k-means")
kmts = TimeSeriesKMeans(n_clusters=n_clusters, verbose=True, metric="dtw", random_state=seed, n_init=2, max_iter_barycenter=10)
y_pred = kmts.fit_predict(tsdata_all)
plot_clusters_subplot(wo_bad_data, y_pred, n_clusters, CLUSTER_XLABELS, kmts.cluster_centers_)
plot_corr_matrix(wo_bad_data, pred_bad, n_clusters)
new_tariff_cluster = wo_bad_data.iloc[pred_bad == 5]
new_tariff_cluster.head()
new_tariff_df = new_tariff_cluster.unstack().reset_index()
LIMIT_VALUE = 0.8
def modify_peak_values(group):
group_date = group[(group["15mins"] >= datetime.time(16,0)) & (group["15mins"] <= datetime.time(20,0))]
vals = group_date[0].subtract(LIMIT_VALUE)
corr_vals = vals[vals > 0]
index_list = corr_vals.index.tolist()
_time = group.loc[corr_vals.index]['15mins']
new_time = pd.to_timedelta(_time.astype(str) - pd.Timedelta(hours=15))
new_time_index_list = []
for t in new_time:
time_vals = str(t).split()[2].split(":")[:2]
h = int(time_vals[0][1:])
m = int(time_vals[1])
ind = group[group["15mins"] == datetime.time(h, m)].index[0]
new_time_index_list.append(ind)
if not group.index.empty:
for i, n_i in zip(index_list, new_time_index_list):
group.at[i, 0] = group.loc[i][0] - corr_vals[i]
group.at[n_i, 0] = group.loc[n_i][0] + corr_vals[i]
return group
_groups = new_tariff_df.groupby("DeviceID")
new_tariff_df_modified = _groups.apply(modify_peak_values).pivot_table(index="DeviceID", columns="15mins", values=0)
new_tariff_cluster_mean = new_tariff_df_modified.unstack().reset_index().groupby("15mins").mean()
fig, ax = plt.subplots(figsize=(20,10))
plt.xticks(rotation=90)
ax.tick_params(axis = 'both', which = 'major', labelsize = 16)
for xx in new_tariff_df_modified.values:
ax.plot(xx.ravel(), "k-", alpha=.2)
ax.plot(new_tariff_cluster_mean.values.ravel(), "go-", linewidth=3)
ax.plot(kmeans_bad.cluster_centers_[5], "r-", linewidth=2)
ax.set_xticks(range(96))
ax.set_xticklabels(CLUSTER_XLABELS)
ax.set_ylabel("Normovaná spotřeba typického dne [-]", fontdict={"fontsize": 15})
ax.set_xlabel("Čas [h]", fontdict={"fontsize": 15})
ax.set_ylim(0, 1)
ax.set_xlim(0, 96)
for label in ax.xaxis.get_ticklabels()[1::2]:
label.set_visible(False)
plt.title(f"Cluster {6}", fontdict={"fontsize": 20})
plt.show()