Feature Engineering
1. Import the packages and read the data
import datetime
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Read the training set and the "A" test set.
data_train = pd.read_csv('tianchi/train.csv')
data_test_a = pd.read_csv('tianchi/testA.csv')
2. Feature preprocessing
In the EDA part we got an idea of the overall distribution of the data and some of its characteristics. In general, we now have to handle the data-preprocessing problems identified during the EDA stage. This section covers filling missing values, transforming time features, and processing object-type features.
First, we find out the object features and numerical features in the data
# Columns whose dtype is not `object` are treated as numerical features.
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)

# Every remaining column is treated as a category feature.
category_fea = [col for col in data_train.columns if col not in numerical_fea]

# The target column is numeric, so it has to be taken out of the feature list.
label = 'isDefault'
numerical_fea.remove(label)
Missing value filling
Replace all missing values with the specified value 0:
# Fill every missing entry of the training frame with the constant 0.
data_train = data_train.fillna(value=0)
Replace each missing value with the value directly above it in the same column (forward fill):
# Forward fill down each column: a missing value takes the last valid value
# above it. `DataFrame.ffill` is used instead of `fillna(method='ffill')`
# because the `method` keyword of `fillna` is deprecated.
data_train = data_train.ffill(axis=0)
Replace each missing value with the value directly below it in the same column (backward fill), filling at most two consecutive missing values:
# Backward fill down each column: a missing value takes the next valid value
# below it; `limit=2` fills at most two consecutive missing values per gap.
# `DataFrame.bfill` is used instead of `fillna(method='bfill')` because the
# `method` keyword of `fillna` is deprecated.
data_train = data_train.bfill(axis=0, limit=2)
# View the number of missing values per column.
data_train.isnull().sum()

# Fill numerical features with the per-column median of the training set.
# The same training medians are applied to the test set.
train_medians = data_train[numerical_fea].median()
data_train[numerical_fea] = data_train[numerical_fea].fillna(train_medians)
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(train_medians)

# Fill category features with the per-column mode of the training set.
# DataFrame.mode() returns a DataFrame (one row per tied mode); passing that
# frame to fillna aligns on the row index and would only fill row 0. Taking
# the first row yields a Series keyed by column name, which fills every row.
train_mode_frame = data_train[category_fea].mode()
if not train_mode_frame.empty:
    train_modes = train_mode_frame.iloc[0]
    data_train[category_fea] = data_train[category_fea].fillna(train_modes)
    data_test_a[category_fea] = data_test_a[category_fea].fillna(train_modes)
Time format processing
# Reference date from which the day offset is measured; it does not depend
# on the frame being processed, so it is built once outside the loop.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')

for data in [data_train, data_test_a]:
    # Convert the issue date strings to datetime values.
    data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    # Construct the time feature: whole days elapsed since `startdate`.
    data['issueDateDT'] = (data['issueDate'] - startdate).dt.days

# Inspect the distribution of employmentLength, counting missing values too.
data_train['employmentLength'].value_counts(dropna=False).sort_index()
Conversion of object type characteristics to numeric values
def employmentLength_to_int(s):
    """Convert an employment-length string such as '3 years' to an int8.

    Missing values are returned unchanged. Otherwise the first
    whitespace-separated token of *s* is converted with ``np.int8``.
    """
    if pd.isnull(s):
        return s
    return np.int8(s.split()[0])


for data in [data_train, data_test_a]:
    # Normalise the two non-numeric spellings so the leading token is a plain
    # integer. The result is assigned back to the column: an in-place replace
    # on `data['employmentLength']` acts on a temporary selection and is not
    # written back to `data` under pandas Copy-on-Write.
    data['employmentLength'] = data['employmentLength'].replace(
        to_replace='10+ years', value='10 years'
    )
    data['employmentLength'] = data['employmentLength'].replace(
        '< 1 year', '0 years'
    )
    data['employmentLength'] = data['employmentLength'].apply(
        employmentLength_to_int
    )

# `data` is still bound to the last frame of the loop (data_test_a) here.
data['employmentLength'].value_counts(dropna=False).sort_index()
Technology
Daily Recommendation