import numpy as np
import pandas as pd
# Load the data sets (hard-coded local paths)
test = pd.read_csv("C:/python study/study base/test.csv")  # test set
train = pd.read_csv("C:/python study/study base/train.csv")  # training set
# Inspect both data sets. DataFrame.info() prints its summary itself and
# returns None, so it is called directly rather than wrapped in print(),
# which would emit a stray "None" line after each summary.
train.info()
print('-------------------------------------------')
test.info()

# Work on copies so the raw frames stay untouched, and drop the 'Cabin'
# column (it has too many missing values).
X_train, X_test = (
    frame.copy().drop(columns='Cabin') for frame in (train, test)
)

# Impute missing values.
# Every fill value is computed from the training set only and then reused for
# the test set, so no test-set statistics enter the preprocessing and both
# frames are imputed consistently (same columns, same values).
fill_values = {
    'Age': X_train['Age'].median(),
    'Fare': X_train['Fare'].median(),
    'Embarked': X_train['Embarked'].mode()[0],  # most frequent value
}
# fillna with a dict only touches the listed columns.
X_train = X_train.fillna(fill_values)
X_test = X_test.fillna(fill_values)


# Drop the label-like columns that are not used as features.
label_columns = ['PassengerId', 'Name', 'Ticket']
X_train = X_train.drop(columns=label_columns)
X_test = X_test.drop(columns=label_columns)

# Split the prediction target off the training features.
y_train = X_train.pop('Survived')


# One-hot encode 'Embarked' and integer-code the binary 'Sex' feature.
# Both encoders are fitted on the training set only. The test set is encoded
# with the fitted encoders' transform() instead of being merged against
# de-duplicated training rows: the merge silently produced NaN codes for any
# value missing from training, whereas transform() raises on unseen values.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)
encoder.fit(X_train[['Embarked']])  # double brackets: the encoder takes 2-D input
le = LabelEncoder()
le.fit(X_train['Sex'])


def _with_encoded_columns(frame):
    """Return *frame* plus the Embarked_* indicator columns and 'Sex_code'."""
    embarked_df = pd.DataFrame(
        encoder.transform(frame[['Embarked']]).astype(int),
        columns=encoder.get_feature_names_out(['Embarked']),
        index=frame.index,  # keep rows aligned for the concat below
    )
    encoded = pd.concat([frame, embarked_df], axis=1)
    encoded['Sex_code'] = le.transform(frame['Sex'])
    return encoded


# Apply the same fitted encoding rules to both data sets.
X_train = _with_encoded_columns(X_train)
X_test = _with_encoded_columns(X_test)

# The raw categorical columns have been encoded; remove them from both frames.
encoded_sources = ['Sex', 'Embarked']
X_train = X_train.drop(columns=encoded_sources)
X_test = X_test.drop(columns=encoded_sources)

# Give the test frame the same column order as the training frame.
X_test = X_test.loc[:, X_train.columns]

# Fit a random forest and report its cross-validated accuracy.
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

# Accuracy over cv=5 folds; print the mean of the per-fold scores.
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
mean_accuracy = scores.mean()
print(mean_accuracy)

不做特征工程、不调整参数,直接使用随机森林模型时,5 折交叉验证的平均准确率约为 80.9%。

Logo

AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。

更多推荐