KDDCUP99数据处理

lisyr44 / 2023-05-13 / 原文

下面的代码对 KDD Cup 99（10% 子集）数据进行预处理：读取数据并为各列命名，对非数值型特征进行编码，对数值型特征进行缩放，最后划分训练集、验证集和测试集。代码实现如下：

# 导入所需的库
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Names of the 41 feature columns followed by the class label, in file order.
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
    'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'label',
]

# Load the dataset; header=None because the first row is read as data,
# so the column names are attached afterwards.
df = pd.read_csv('kddcup.data_10_percent_corrected', header=None)
df.columns = column_names

# Convert the non-numeric columns (three categorical features and the class
# label) to integer codes.  One LabelEncoder instance is refitted for each
# column in turn, so once the loop finishes `le` holds the mapping for
# 'label' only.
le = LabelEncoder()
for column in ('protocol_type', 'service', 'flag', 'label'):
    df[column] = le.fit_transform(df[column])

# Numeric feature columns to rescale.  Columns not listed here (for example
# 'land' and 'logged_in') are left untouched by this step.
numeric_features = [
    'duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'num_compromised', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
]

# Standardize the numeric features (zero mean, unit variance).
# The fitted transformer is kept under its own name: the min-max scaler below
# is fitted on the standardized values, so transforming new data later
# requires applying both fitted objects in this order.
standard_scaler = StandardScaler()
df[numeric_features] = standard_scaler.fit_transform(df[numeric_features])

# Rescale the standardized features to the [0, 1] range.
# `scaler` still ends up bound to the fitted MinMaxScaler.
# NOTE(review): min-max scaling is unaffected by a preceding per-column
# affine transform, so the resulting values match MinMaxScaler applied alone
# (up to floating-point rounding); the standardization step could be dropped.
# NOTE(review): both scalers are fitted on the whole DataFrame; if a held-out
# split is taken afterwards, its statistics have already influenced the
# scaling -- consider fitting on the training rows only.
scaler = MinMaxScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# One-hot encode the categorical features.
# sparse_output=False makes the encoder return a dense array, which can be
# wrapped in a DataFrame directly.
encoder = OneHotEncoder(sparse_output=False)
categorical_features = ['protocol_type', 'service', 'flag']
encoded_features = pd.DataFrame(
    encoder.fit_transform(df[categorical_features]),
    # Descriptive string names ("<feature>_<category>") instead of the default
    # integer labels 0..N-1: this keeps every column name a string (estimators
    # reject frames whose feature names mix str and int) and records which
    # category each indicator column stands for.
    columns=encoder.get_feature_names_out(categorical_features),
    # Reuse df's index so the column-wise concat below aligns row for row
    # even if df does not carry a default RangeIndex.
    index=df.index,
)

# Swap the original categorical columns for their one-hot expansion.
df.drop(categorical_features, axis=1, inplace=True)
df = pd.concat([df, encoded_features], axis=1)

# Separate the target column from the feature matrix.
y = df['label']
X = df.drop(columns='label')

# Two-stage split into training, validation and test sets:
#   1. hold out 20% of all rows as the test set;
#   2. hold out 12.5% of the remaining 80% (10% of all rows) for validation.
# The result is roughly 70% train / 10% validation / 20% test.  Both calls
# use random_state=42, so the split is reproducible.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.125, random_state=42
)