1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
|
""" @author: abc @file: xyzforest.py @date: 2017-01-06 """ __author__ = "abc"
import numpy as np import pandas as pd import csv from sklearn.ensemble import RandomForestClassifier
class Titanic(object):
    """Random-forest pipeline for the Kaggle Titanic passenger data.

    The pipeline loads the train/test CSV files, converts the categorical
    columns to integer codes, fills missing values, buckets a few columns,
    fits a ``RandomForestClassifier`` and writes the predictions to CSV.
    """

    def __init__(self,
                 train_path="/home/abc/Projects/kaggle/Titanic/train.csv",
                 test_path="/home/abc/Projects/kaggle/Titanic/test.csv"):
        """Remember where the training and test CSV files live.

        :param train_path: path of the training CSV (defaults to the
            location that used to be hard-coded).
        :param test_path: path of the test CSV (same remark).
        """
        self.train_path = train_path
        self.test_path = test_path

    def load_data(self, path):
        """Load a CSV file whose first row is the header.

        :param path: path of the CSV file.
        :return: the file content as a ``pandas.DataFrame``.
        """
        return pd.read_csv(path, header=0)

    def _wash_common(self, data):
        """Apply the cleaning steps shared by the train and test sets.

        Works on a copy so the caller's frame is never modified.

        :param data: raw frame with ``Sex``, ``Embarked`` and ``Age`` columns.
        :return: a new frame with ``Sex``/``Embarked`` integer-encoded and
            missing ``Embarked``/``Age`` values filled.
        """
        data = data.copy()
        data["Sex"] = data["Sex"].map({'female': 0, 'male': 1}).astype(int)

        embarked = data["Embarked"]
        if embarked.isnull().any():
            # mode() returns a Series; take its first entry as the fill value.
            embarked = embarked.fillna(embarked.dropna().mode().iloc[0])
        # Sorted order gives the same codes on every run and on both data
        # sets (a bare set() has no stable order).
        # NOTE(review): codes only line up between train and test when both
        # files contain the same set of ports -- confirm for new data.
        ports_dict = {name: index
                      for index, name in enumerate(sorted(embarked.unique()))}
        data["Embarked"] = embarked.map(ports_dict).astype(int)

        age = data["Age"]
        if age.isnull().any():
            data["Age"] = age.fillna(age.dropna().median())
        return data

    def _finish_wash(self, data):
        """Drop the unused columns, bucket the features and return an array.

        :param data: frame produced by ``_wash_common``.
        :return: ``data.values`` after dropping and bucketing.
        """
        data = data.drop(self.drop_label(), axis=1)
        self.data_concat(data)
        return data.values

    def wash_train_data(self, train_data):
        """Clean the training data.

        :param train_data: raw training ``DataFrame`` (left unmodified).
        :return: numpy array of the cleaned rows; ``run`` treats column 0
            as the label and the remaining columns as features.
        """
        return self._finish_wash(self._wash_common(train_data))

    def wash_test_data(self, test_data):
        """Clean the test data.

        Same steps as ``wash_train_data`` plus filling missing ``Fare``
        values with the median fare of the passenger's ``Pclass``.

        :param test_data: raw test ``DataFrame`` (left unmodified).
        :return: numpy array of the cleaned feature rows.
        """
        data = self._wash_common(test_data)

        fare_missing = data["Fare"].isnull()
        if fare_missing.any():
            for pclass in (1, 2, 3):
                in_class = data["Pclass"] == pclass
                to_fill = fare_missing & in_class
                if to_fill.any():
                    class_median = data.loc[in_class, "Fare"].dropna().median()
                    data.loc[to_fill, "Fare"] = class_median

        return self._finish_wash(data)

    def drop_label(self):
        """Return the column names that are removed before modelling.

        :return: list of column names.
        """
        return ['Name', 'Ticket', 'PassengerId']

    def data_concat(self, raw_data):
        """Bucket ``Age``, ``Parch`` and ``Cabin`` in place.

        * ``Age``: 0 for < 18, 1 for 18 <= age < 45, 2 for >= 45.
        * ``Parch``: any positive value becomes 1.
        * ``Cabin``: 1 when the value is missing, 0 otherwise.

        :param raw_data: ``DataFrame`` to modify in place.
        :return: None
        """
        # Build every mask from the untouched values first; writing a bucket
        # code and then re-testing the column would re-classify rows (this is
        # what used to turn every Cabin value into 0).
        age = raw_data["Age"]
        is_minor = age < 18
        is_adult = (age >= 18) & (age < 45)
        is_senior = age >= 45
        has_parch = raw_data["Parch"] > 0
        cabin_missing = raw_data["Cabin"].isnull()

        # Single .loc assignments: chained indexing writes to a temporary.
        raw_data.loc[is_minor, "Age"] = 0
        raw_data.loc[is_adult, "Age"] = 1
        raw_data.loc[is_senior, "Age"] = 2
        raw_data.loc[has_parch, "Parch"] = 1
        raw_data["Cabin"] = cabin_missing.astype(int)

    def rediction_model(self, train_data, label_data, test_data,
                        n_estimators=2000):
        """Fit a random forest and predict labels for the test rows.

        The method name (sic) is kept as-is so existing callers keep working.

        :param train_data: training feature matrix.
        :param label_data: training labels, one per row of ``train_data``.
        :param test_data: feature matrix to predict.
        :param n_estimators: number of trees in the forest.
        :return: predicted labels as an integer array.
        """
        forest = RandomForestClassifier(n_estimators=n_estimators)
        forest = forest.fit(train_data, label_data)
        return forest.predict(test_data).astype(int)

    def save_result(self, path, head, data):
        """Write a header row followed by the data rows to a CSV file.

        :param path: destination file path.
        :param head: sequence of column names.
        :param data: iterable of rows (each row is itself iterable).
        :return: None
        """
        import sys

        # The csv module wants a binary file on Python 2 and a text file
        # opened with newline="" on Python 3.
        if sys.version_info[0] >= 3:
            handle = open(path, "w", newline="")
        else:
            handle = open(path, "wb")
        with handle as out_file:
            csv_obj = csv.writer(out_file)
            csv_obj.writerow(head)
            for item in data:
                csv_obj.writerow(list(item))

    def run(self, output_path="xyzforest.csv"):
        """Run the whole pipeline: load, clean, fit, predict, save.

        :param output_path: where the prediction CSV is written.
        :return: None
        """
        train_data = self.load_data(self.train_path)
        test_data = self.load_data(self.test_path)
        passenger_id = test_data.PassengerId.values

        train_data = self.wash_train_data(train_data)
        test_data = self.wash_test_data(test_data)

        # Column 0 of the cleaned training array is the label.
        label_data = train_data[:, 0].astype(int)
        train_data = train_data[:, 1:]

        head = ["PassengerId", "Survived"]
        predictions = self.rediction_model(train_data, label_data, test_data)
        self.save_result(output_path, head, zip(passenger_id, predictions))
if __name__ == "__main__":
    # run() writes the prediction CSV and returns None, so its result is not
    # printed; a plain call is also valid syntax on both Python 2 and 3.
    tt = Titanic()
    tt.run()
|