Kaggler

Published on Aug. 22, 2023, 12:09 p.m.

Kaggler

One-Hot, Label, Target, Frequency, and Embedding Encoders for Categorical Features
import pandas as pd
from kaggler.preprocessing import OneHotEncoder, LabelEncoder, TargetEncoder, FrequencyEncoder, EmbeddingEncoder

# Load the training data; the last column is taken to be the target.
trn = pd.read_csv('train.csv')
target_col = trn.columns[-1]
# Categorical features are the object-dtype columns. The target is excluded so
# it is never encoded as a feature and is not required in the test file.
cat_cols = [col for col in trn.columns
            if trn[col].dtype == 'object' and col != target_col]


ohe = OneHotEncoder(min_obs=100)  # grouping all categories with fewer than 100 occurrences
lbe = LabelEncoder(min_obs=100)   # grouping all categories with fewer than 100 occurrences
te = TargetEncoder()              # replacing each category with the average target value of the category
fe = FrequencyEncoder()           # replacing each category with the frequency value of the category
ee = EmbeddingEncoder()           # mapping each category to a vector of real numbers


# NOTE: trn[cat_cols] is overwritten by each encoder in turn, so every encoder
# after `lbe` is fitted on the previous encoder's output rather than on the raw
# categories. Assign to separate variables if independent encodings are wanted.
X_ohe = ohe.fit_transform(trn[cat_cols])  # X_ohe is a scipy sparse matrix
trn[cat_cols] = lbe.fit_transform(trn[cat_cols])
# The target encoder averages the target per category, so it needs the target.
trn[cat_cols] = te.fit_transform(trn[cat_cols], trn[target_col])
trn[cat_cols] = fe.fit_transform(trn[cat_cols])
X_ee = ee.fit_transform(trn[cat_cols], trn[target_col])  # X_ee is a numpy matrix


# Apply the fitted encoders to the test set in the same order as on the
# training set. X_ohe and X_ee are rebound to the test-set results here.
tst = pd.read_csv('test.csv')
X_ohe = ohe.transform(tst[cat_cols])
tst[cat_cols] = lbe.transform(tst[cat_cols])
tst[cat_cols] = te.transform(tst[cat_cols])
tst[cat_cols] = fe.transform(tst[cat_cols])
X_ee = ee.transform(tst[cat_cols])


Tags: