In [2]:
import json

import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint

import plotly.io as pio
pio.renderers.default='jupyterlab'  # notebook doesn't work check https://plotly.com/python/renderers/
pd.options.display.max_columns = 100
pd.options.display.max_rows = 300

In [3]:
mats = pd.read_csv("in/tables/MATERIALS_FOR_CLASSIFICATION_CLEAN.csv")
cats = pd.read_csv("in/tables/MATERIAL_CATEGORY_FOR_CLASSIFICATION.csv")

df = pd.merge(mats,cats,on=["MATERIAL_ODS_ID","SHOP"])

def create_category_dict(df):
    return df.groupby("CATEGORY_ID").CATEGORY_PATH.unique().to_dict()
category_dict = create_category_dict(df)

# should be applied after category_decoder_dict
df = df[~df.MATERIAL_ID.duplicated()].reset_index(drop=True).copy()

for k,v in category_dict.items():
    df.loc[df.CATEGORY_ID == k,"CATEGORY_PATH_FIX"] = v[0]

In [4]:
subset = df.copy()

In [5]:
subset.CATEGORY_PATH = subset.CATEGORY_PATH.str.split(" > ").apply(lambda x: " > ".join(x[1:]))

In [6]:
from sklearn.preprocessing import LabelEncoder

# encode matcat names as integer values because transformer models do not accept strings
subset['CATEGORY_SEQ_TOKENS'] = subset['CATEGORY_PATH'].str.split(" > ")
unique_tokens = []
for i in subset['CATEGORY_SEQ_TOKENS']:
    unique_tokens.extend(i)
    
label_enc = LabelEncoder()
labeled_tokens = label_enc.fit_transform(unique_tokens)

tokens_seq = []
pos=0
for i in subset['CATEGORY_SEQ_TOKENS']:
    tokens_seq.append(labeled_tokens[pos:pos+len(i)])
    pos+=len(i)
    
# new from transformers
int_tokens_seq = [str(i).strip("[ ]").split(" ") for i in tokens_seq]

In [7]:
for i in int_tokens_seq:
    try:
        for _ in range(10): del i[i.index("")]
    except:
        continue

In [8]:
subset['CATEGORY_SEQ'] = tokens_seq

In [9]:
subset["CATEGORY_SEQ_STR"] = subset.CATEGORY_SEQ.apply(lambda x: " ".join(x.astype(str)))

In [10]:
def prepare_MT_data(row):
    return row["NAME_BRIEF"] + " \t " + row["CATEGORY_SEQ_STR"]

In [11]:
subset["text"] = subset.apply(prepare_MT_data, axis=1)

In [29]:
# test_categories = df.CATEGORY_ID.value_counts().reset_index().query("CATEGORY_ID > 20")["index"].values
# df = df.query("CATEGORY_ID in @test_categories").copy()

def remove_too_small_categories(df, how_many=20):
    """
    Remove all categories small than @how_many, 
    because it is unsufficient to add them to classification.
    These categories need to be evaluated in final testing.
    """
    too_small_cats = df.CATEGORY_ID.value_counts().reset_index().query("CATEGORY_ID<@how_many")["index"].unique()
    indices = df[df.CATEGORY_ID.isin(too_small_cats)].index
    small_cat_df = df.iloc[indices].copy()
    print(f"Len of df before: {df.shape[0]}")
    df.drop(indices, inplace=True)
    print(f"Len of df after: {df.shape[0]}")
    return small_cat_df

too_small_categories_df = remove_too_small_categories(subset)



Len of df before: 240998
Len of df after: 240998


In [13]:
from sklearn.model_selection import train_test_split

def create_datasets(df, stratify_by="CATEGORY_ID",make_valid=None, random_state=1):
    """
    Create train/test, possibly also validation dataset.
    If make_valid=True, then returns df,list(df,df)
    """
    train, test = train_test_split(df.reset_index(drop=True), 
                                   stratify=df[stratify_by],
                                   test_size=0.3, 
                                   random_state=random_state)
    if make_valid:
        test = train_test_split(test.reset_index(drop=True),
                                test_size=0.3,
                                random_state=random_state)
    
    return train, test

def over_sample(df, categories_smaller=100):
    """
    Oversample for really small categories
    """
    new_df = pd.DataFrame()
    # smaller categories
    small_categories = (df.CATEGORY_ID
                        .value_counts()
                        .reset_index()
                        .query("CATEGORY_ID < 100"))
    
    for _,cat_id, count in small_categories.itertuples():
        append_another = categories_smaller-count
        df_cat = df.query("CATEGORY_ID == @cat_id")
        new_df = new_df.append(df_cat.iloc[np.random.randint(df_cat.shape[0],size=append_another)])
    
    print(f"Appending {new_df.shape[0]} of new samples")
    return pd.concat([df,new_df])



In [14]:
subset = subset[["MATERIAL_ID","NAME_BRIEF","text","CATEGORY_SEQ", "CATEGORY_PATH", "CATEGORY_ID"]].copy()
# subset = subset[subset.NAME.str.len() >9].copy().reset_index()
# subset.NAME = subset.NAME.str.lower()
# subset.NAME = subset.NAME.str.replace('\n', '')

In [15]:

train, test = create_datasets(subset)
test, validation = create_datasets(test)

In [19]:
train.shape, test.shape, validation.shape

((168698, 6), (50610, 6), (21690, 6))

In [20]:
import csv
def save_set(subset,name):
    subset[["text"]].to_csv(f'data/mt/categorizer.cs_cat.{name}.tsv', index=False, sep="$", header=False,
                           quoting = csv.QUOTE_NONE, escapechar = ' ')
    subset[["NAME_BRIEF"]].to_csv(f'data/mt/categorizer.cs_cat.{name}.tok.cs', index=False, sep="$", header=False,
                           quoting = csv.QUOTE_NONE, escapechar = ' ')
    subset[["CATEGORY_SEQ"]].to_csv(f'data/mt/categorizer.cs_cat.{name}.tok.cat', index=False, sep="$", header=False,
                           quoting = csv.QUOTE_NONE, escapechar = ' ')

In [21]:
!mkdir -p data/mt

In [22]:
save_set(train, "train")
save_set(test, "test")
save_set(validation, "valid")

In [23]:
!fairseq-preprocess \
    --source-lang cs \
    --target-lang cat \
    --trainpref data/mt/categorizer.cs_cat.train.tok \
    --validpref data/mt/categorizer.cs_cat.valid.tok \
    --testpref data/mt/categorizer.cs_cat.test.tok \
    --destdir data/mt-bin \
    --thresholdsrc 3 \
    --thresholdtgt 3

2022-01-02 17:46:07 | INFO | fairseq_cli.preprocess | Namespace(align_suffix=None, alignfile=None, all_gather_list_size=16384, bf16=False, bpe=None, checkpoint_shard_count=1, checkpoint_suffix='', cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data/mt-bin', empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, only_source=False, optimizer=None, padding_factor=8, profile=False, quantization_config_path=None, scoring='bleu', seed=1, source_lang='cs', srcdict=None, target_lang='cat', task='translation', tensorboard_logdir=None, testpref='data/mt/categorizer.cs_cat.test.tok', tgtdict=None, threshold_loss_scale=None, thresholdsrc=3, thresholdtgt=3, tokenizer=No

In [40]:
# pip install tensorboardX tensorflow keras==2.6

Collecting tensorboardX
  Using cached tensorboardX-2.4-py2.py3-none-any.whl (124 kB)
Collecting tensorflow
  Downloading tensorflow-2.7.0-cp38-cp38-manylinux2010_x86_64.whl (489.6 MB)
[K     |███████████████████▍            | 296.6 MB 133.6 MB/s eta 0:00:02

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



[K     |████████████████████████████████| 489.6 MB 20 kB/s /s eta 0:00:01
[?25hCollecting keras==2.6
  Downloading keras-2.6.0-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 118.0 MB/s eta 0:00:01
Collecting google-pasta>=0.1.1
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 12.2 MB/s  eta 0:00:01
[?25hCollecting wrapt>=1.11.0
  Downloading wrapt-1.13.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (84 kB)
[K     |████████████████████████████████| 84 kB 10.2 MB/s  eta 0:00:01
[?25hCollecting keras-preprocessing>=1.1.1
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 4.2 MB/s s eta 0:00:01
[?25hCollecting absl-py>=0.4.0
  Downloading absl_py-1.0.0-py3-none-any.whl (126 kB)
[K     |████████████████████████████████| 126 kB 122.2 MB/s eta 0:00:01
[?25hCollecting flatbuffers<

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



[K     |████████████████████████████████| 458.4 MB 23 kB/s /s eta 0:00:01
[?25hCollecting termcolor~=1.1.0
  Downloading termcolor-1.1.0.tar.gz (3.9 kB)
Collecting absl-py~=0.10
  Downloading absl_py-0.15.0-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 85.0 MB/s eta 0:00:01
[?25hCollecting typing-extensions~=3.7.4
  Downloading typing_extensions-3.7.4.3-py3-none-any.whl (22 kB)
Collecting astunparse~=1.6.3
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting flatbuffers~=1.12.0
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting h5py~=3.1.0
  Downloading h5py-3.1.0-cp38-cp38-manylinux1_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 119.4 MB/s eta 0:00:01
[?25hCollecting six>=1.9
  Downloading six-1.15.0-py2.py3-none-any.whl (10 kB)
Collecting wrapt~=1.12.1
  Downloading wrapt-1.12.1.tar.gz (27 kB)
Collecting gast==0.4.0
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting tensorflow-e

In [90]:
!fairseq-train \
    data/mt-bin \
    --arch lstm \
    --share-decoder-input-output-embed \
    --optimizer adam \
    --save-dir data/mt-ckpt \
    --tokenizer moses \
    --decoder-hidden-size 256 \
    --encoder-hidden-size 256 \
    --dropout 0.2 \
    --lr 1.0e-3 \
    --max-tokens 4096

2022-01-02 18:05:23 | INFO | fairseq_cli.train | Namespace(adam_betas='(0.9, 0.999)', adam_eps=1e-08, adaptive_softmax_cutoff='10000,50000,200000', all_gather_list_size=16384, arch='lstm', batch_size=None, batch_size_valid=None, best_checkpoint_metric='loss', bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_shard_count=1, checkpoint_suffix='', clip_norm=0.0, cpu=False, criterion='cross_entropy', curriculum=0, data='data/mt-bin', data_buffer_size=10, dataset_impl=None, ddp_backend='c10d', decoder_attention='1', decoder_dropout_in=0.2, decoder_dropout_out=0.2, decoder_embed_dim=512, decoder_embed_path=None, decoder_freeze_embed=False, decoder_hidden_size=256, decoder_layers=1, decoder_out_embed_dim=512, device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_num_procs=0, distributed_port=-1, distributed_rank=0, distributed_world_size=1, distributed_wrapper='DDP', dropout=0.2, empty

In [67]:
!fairseq-generate \
    data/mt-bin \
    --path data/mt-ckpt/checkpoint_best.pt \
    --tokenizer moses \
    --beam 3 \
    --results-path data/mt-test

  beams_buf = indices_buf // vocab_size
  unfin_idx = idx // beam_size
                                                                                

In [68]:
!grep ^T data/mt-test/generate-test.txt | cut -f1- > name.txt

In [69]:
!grep ^T data/mt-test/generate-test.txt | cut -f2- > target.txt

In [70]:
!grep ^H data/mt-test/generate-test.txt | cut -f3- > hypotheses.txt

In [136]:
results = pd.read_csv("name.txt", sep="\t", header=None)

In [137]:
y_pred = pd.read_csv("hypotheses.txt",header=None)
y_pred = y_pred.rename(columns={0:"pred"})

In [138]:
results["index"] = results[0].str.strip("T-")
results = pd.concat((results,y_pred),axis=1)

In [240]:
test = pd.read_csv("data/mt/categorizer.cs_cat.test.tsv", nrows=80000, sep="\t",header=None)

In [230]:
test = test.rename(columns={0:"NAME",1:"CATEGORY_SEQ_STR"})

In [247]:
evaluation = test.copy()

In [241]:
[int(i) for i in new.CATEGORY_SEQ_STR.str.split().tolist()[0]]

TypeError: 'float' object is not iterable

In [None]:
new_df = pd.DataFrame()
for _,i in results.iterrows():
    index = int(i["index"])
    if index<80000:
        string_categories = i["pred"].strip("[]").split(" ")
        categories = [int(i) for i in string_categories if i!=""]
        encoded_categories = label_enc.inverse_transform(categories)

        new = evaluation.iloc[index:index+1].copy()
        new["results"] = " > ".join(encoded_categories[1:])
        try:
            true_categories = label_enc.inverse_transform(([int(i) for i in new.CATEGORY_SEQ_STR.str.split().tolist()[0]]))
        except:
            continue
        new["CATEGORY_TRUE"] = " > ".join(true_categories[1:])
        size = (true_categories.shape if encoded_categories.shape >= true_categories.shape else encoded_categories.shape)[0]
        new["same"] = (encoded_categories[:size] == true_categories[:size]).all()
        new_df = new_df.append(new)

In [251]:
new_df.to_csv("test_lstm.csv",index=False)

In [288]:
subset["CATEGORY_TRUE"] = subset.CATEGORY_SEQ.apply(label_enc.inverse_transform)

In [306]:
new_df["results_post"] = new_df.results.apply(lambda x: " > ".join(x))

In [311]:
new_cats = new_df["results_post"].unique()
old_cats = subset.CATEGORY_PATH.unique()

In [321]:
for i in new_cats:
    if i in old_cats:
        pass
    else:
        print(i)
        print(new_df[(new_df.results_post == i)][["NAME","CATEGORY_TRUE"]].values)

Hobby a zahrada > Nářadí elektrické > Zahradní houpačky, houpací sítě
[['Diamond flexibilní brusný kotouč'
  array(['Hobby a zahrada', 'Nářadí elektrické', 'Příslušenství', 'Kotouče'],
        dtype='<U31')                                                        ]
 ['Diamond flexibilní brusný kotouč'
  array(['Hobby a zahrada', 'Nářadí elektrické', 'Příslušenství', 'Kotouče'],
        dtype='<U31')                                                        ]]
Hobby a zahrada > Kuchyně > Dřezy
[['Zorba 440E - Sand'
  array(['Hobby a zahrada', 'Dům, byt', 'Kuchyně', 'Dřezy'], dtype='<U31')]]
Hobby a zahrada > Stavby na zahradě > Příslušenství
[['Roleta RÁKOS NATUR 120 x 180 cm'
  array(['Hobby a zahrada', 'Stavby na zahradě', 'Ploty',
         'Zastínění oplocení'], dtype='<U31')            ]]
Hobby a zahrada > Dům, byt > Koupelna a sanitární technika > Vany, sprchy > Vany
[['Shower select baterie pod omítku pro 2 spotřebiče, chrom (15748000)'
  array(['Hobby a zahrada', 'Dům, byt', 'Koupelna

In [285]:
df.CATEGORY_PATH.str.startswith("Hobby a zahrada > Ruční nářadí").sum()

37151

In [280]:
df.CATEGORY_PATH.str.startswith("Hobby a zahrada > Nářadí ruční").sum()

15651

In [323]:
new_df[new_df.same == False].tail(50).values

array([[25006, 100054071141,
        'pilníky jehlové 160 mm v plastovém stojánku, sada 12 dílů, sek 2, NFB 2492 160/2 - PFERD',
        'pilníky jehlové 160 mm v plastovém stojánku, sada 12 dílů, sek 2, NFB 2492 160/2 - PFERD \t 5 31 17',
        array([ 5, 31, 17]),
        array(['Hobby a zahrada', 'Nářadí ruční', 'Nůžky, nože, pilky'],
              dtype='<U31')                                             ,
        array(['Hobby a zahrada', 'Ruční nářadí', 'Nůžky, nože, pilky'],
              dtype='<U31')                                             ,
        False, 'Hobby a zahrada > Nářadí ruční > Nůžky, nože, pilky'],
       [23497, 100023313606,
        'Rukavice OCS, 60310005, grilovací, originál, žáruvzdorná kůže, 100 % bavlna podšívky, kožený řemínek, 38 x 18,5 cmy,',
        'Rukavice OCS, 60310005, grilovací, originál, žáruvzdorná kůže, 100 % bavlna podšívky, kožený řemínek, 38 x 18,5 cmy, \t 5 3 28',
        array([ 5,  3, 28]),
        array(['Hobby a zahrada', 'Nářadí 

In [91]:
from fairseq.models.lstm import LSTMModel
model = LSTMModel.from_pretrained(model_name_or_path='/data/data',
                                  checkpoint_file='/data/data/mt-ckpt/checkpoint_best.pt',
                                  data_name_or_path='/data/data/mt-bin'
)

In [96]:
test_example = model.translate(test.NAME_BRIEF.values, beam=1,verbose=True)


__floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').


__floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').



In [97]:
predictions = []
seq = []
for i in test_example:
    sequence = i.split("]")[0].replace("[","").replace("]","").split()
    prediction = label_enc.inverse_transform([int(j) for j in sequence])
    
    seq.append(sequence)
    predictions.append(prediction)

In [101]:
seq[:10]

[['528', '387', '355', '433'],
 ['90', '577'],
 ['327', '426'],
 ['371', '280'],
 ['419', '301', '355'],
 ['334', '257'],
 ['334', '257'],
 ['90', '175', '491'],
 ['523', '201'],
 ['513', '126']]

In [115]:
train.to_csv("train_data_seq2seq.csv",index=False)

In [114]:
test.to_csv("test_data_seq2seq.csv",index=False)

In [116]:
validation.to_csv("valid_data_seq2seq.csv",index=False)

In [104]:
test["predictions"]= [" > ".join(i) for i in predictions]

In [107]:
test["pred_seqs"]= seq

In [113]:
test.query("CATEGORY_PATH != predictions")

Unnamed: 0,MATERIAL_ID,NAME_BRIEF,text,CATEGORY_SEQ,CATEGORY_PATH,CATEGORY_ID,predictions,pred_seqs
906,100038811702,stropn sušák ; stropn sušák prádl výborn pomoc...,stropn sušák ; stropn sušák prádl výborn pomoc...,"[90, 440]","Dům, byt > Sušáky na prádlo",35,"Dům, byt > Šňůry, sušáky na prádlo","[90, 577]"
37949,100040263921,napínák drátěn plot ocel stříbrn ; tat sad plo...,napínák drátěn plot ocel stříbrn ; tat sad plo...,"[419, 301, 112, 355]",Stavby na zahradě > Ploty > Gabiony > Přísluše...,157759,Stavby na zahradě > Ploty > Příslušenství,"[419, 301, 355]"
52373,3022275,gril bucket green rozbalit ; gril bucket kompa...,gril bucket green rozbalit ; gril bucket kompa...,"[119, 117]","Grily, udírny a kotlíky > Grily na dřevěné uhlí",1175162,"Grily, udírny a kotlíky > Plynové grily","[119, 303]"
53980,100066724714,sprch walk stříbr lesk skl sítotisk ; sprch sa...,sprch walk stříbr lesk skl sítotisk ; sprch sa...,"[90, 174, 546]","Dům, byt > Koupelna a sanitární technika > Zás...",6403678,"Dům, byt > Koupelna, sanitarni technika > Zást...","[90, 175, 546]"
27009,100049525093,matic nýtovac sad ; matic nýtovac sad ocel oce...,matic nýtovac sad ; matic nýtovac sad ocel oce...,"[371, 157, 244]",Ruční nářadí > Kleště > Nýtovací kleště,65221461,Ruční nářadí > Ostatní ruční nářadí,"[371, 280]"
...,...,...,...,...,...,...,...,...
11263,100021254305,kbelík čern ; praktick kbelík čern vhodn každ ...,kbelík čern ; praktick kbelík čern vhodn každ ...,"[90, 214, 215]","Dům, byt > Malířské potřeby > Malířské přísluš...",221509,Stavby na zahradě > Ploty > Příslušenství,"[419, 301, 355]"
19705,100030257853,altán krém textil ; tent prostorn zahradn altá...,altán krém textil ; tent prostorn zahradn altá...,"[523, 398, 418]","Zahradní nábytek > Slunečníky, zastínění > Sta...",5603,"Stavby na zahradě > Domky, altány > Altány","[419, 66, 23]"
60994,100050631647,vrták kov tundr hss kobalt válc stopk ; vrták ...,vrták kov tundr hss kobalt válc stopk ; vrták ...,"[241, 355, 499]",Nářadí elektrické > Příslušenství > Vrtáky a s...,10262329,"Nářadí elektrické > Příslušenství > Vrtáky, sa...","[241, 355, 500]"
16957,100066834977,temp kondel zahradn altánek modr gotan ; mater...,temp kondel zahradn altánek modr gotan ; mater...,"[419, 66, 23]","Stavby na zahradě > Domky, altány > Altány",5603,"Zahradní nábytek > Slunečníky, zastínění > Sta...","[523, 398, 418]"


In [100]:
from sklearn.metrics import f1_score

In [112]:
f1_score(test.CATEGORY_PATH, test.predictions, average="weighted")

0.7899187272879282

In [54]:
label_enc.inverse_transform([int(j) for j in i.strip("[ ]").split()])

array(['Nářadí zahradní', 'Pily'], dtype='<U39')

In [58]:
i.strip("[ ]").split()

['243', '290']

In [47]:
label_enc.inverse_transform([int(i) for i in test_example.strip("[ ]").split()])

AttributeError: 'list' object has no attribute 'strip'

In [37]:
for i in model.generate(model.encode(example)):
    tokens = [int(i) for i in model.decode(i["tokens"]).strip("[ ]").strip("]").split()]
    print(" > ".join(label_enc.inverse_transform(tokens)))

Nářadí zahradní > Pily
Ruční nářadí > Ostatní ruční nářadí
Dílna, stavební technika > Kompresory


ValueError: invalid literal for int() with base 10: '366]'

In [34]:
label_enc.inverse_transform([int(i) for i in test_example.strip("[ ]").split()])

array(['Nářadí zahradní', 'Pily'], dtype='<U39')