Titanic Survival Prediction in Practice

This walkthrough uses modules such as numpy and pandas; if you are not familiar with them yet, please fill in those gaps first.

The dataset can be downloaded from the Kaggle Titanic competition; it provides train.csv, test.csv, and gender_submission.csv, which are the files used below.

Data Preprocessing

import pandas as pd
import numpy as np
data=pd.read_csv("train.csv")
# inspect the column names of the dataset
print(data.columns)

The data fields have the following meanings:
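(For reference, these are the standard columns of the Kaggle Titanic training set.)

PassengerId: unique passenger ID
Survived: survival label (0 = did not survive, 1 = survived)
Pclass: ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
Name: passenger name
Sex: sex
Age: age in years
SibSp: number of siblings/spouses aboard
Parch: number of parents/children aboard
Ticket: ticket number
Fare: passenger fare
Cabin: cabin number
Embarked: port of embarkation (S = Southampton, C = Cherbourg, Q = Queenstown)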

Next, we drop the fields that are not very useful here, such as Name and Ticket, by keeping only the columns we need:

data=data[["Survived","Pclass","Sex","Age","SibSp","Parch","Fare","Cabin","Embarked"]]

Next, we notice that the Age column contains some NaN values. We fill them with the mean age, even though mean imputation is not a particularly good choice:

data["Age"]=data["Age"].fillna(data["Age"].mean())

Numerically encode the Cabin field with pd.factorize. It returns a tuple, and the first element of that tuple holds the encoded values:

data["Cabin"]=pd.factorize(data.Cabin)[0]

Fill all remaining NaN values with 0 (after the steps above, in the training set this mainly affects Embarked):

data.fillna(0,inplace=True)

Numerically encode the Sex feature:

data["Sex"]=[1 if x=="male" else 0 for x in data.Sex]

One-hot encode Pclass by adding three new features, then delete the original Pclass column:

data["p1"]=np.array(data["Pclass"]==1).astype(np.int32)
data["p2"]=np.array(data["Pclass"]==2).astype(np.int32)
data["p3"]=np.array(data["Pclass"]==3).astype(np.int32)
del data["Pclass"]

For Embarked, first check its values: there are three ports of embarkation, and the 0 is the value we just used to fill the NaN entries.

print(data.Embarked.unique())
#['S' 'C' 'Q' 0]

Next, one-hot encode Embarked and then delete the Embarked feature:

data["e1"]=np.array(data["Embarked"]=="S").astype(np.int32)
data["e2"]=np.array(data["Embarked"]=="C").astype(np.int32)
data["e3"]=np.array(data["Embarked"]=="Q").astype(np.int32)
del data["Embarked"]

That completes the data preprocessing.

Extracting the Training Data

data_train=data[["Sex","Age","SibSp","Parch","Fare","Cabin","p1","p2","p3","e1","e2","e3"]]

The data labels, reshaped into a column vector so that their shape matches the y placeholder defined below:

data_target=data["Survived"].values.reshape(len(data),1)

Building the Network

import tensorflow as tf

x = tf.placeholder("float", shape=[None, 12])   # 12 input features
y = tf.placeholder("float", shape=[None, 1])    # survival label

weight = tf.Variable(tf.random_normal([12, 1]))
bias = tf.Variable(tf.random_normal([1]))
output = tf.matmul(x, weight) + bias                   # raw logits
pred = tf.cast(tf.sigmoid(output) > 0.5, tf.float32)   # hard 0/1 prediction
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=output))
train_step = tf.train.GradientDescentOptimizer(0.001).minimize(loss)
accuracy = tf.reduce_mean(tf.cast(tf.equal(pred, y), tf.float32))
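Note that this "network" is just a single linear layer followed by a sigmoid, i.e. logistic regression. The un-squashed output is passed to tf.nn.sigmoid_cross_entropy_with_logits, which applies the sigmoid internally and evaluates the binary cross-entropy -[y*log(sigmoid(z)) + (1-y)*log(1-sigmoid(z))] on the logits z in a numerically stable form; pred applies the sigmoid and thresholds at 0.5 only for measuring accuracy.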

Visualization

import matplotlib.pyplot as plt
# loss_train, train_acc and test_acc are the lists filled in during the training loop (see the full code below)

plt.plot(loss_train, 'k-')
plt.title('train loss')
plt.show()

plt.plot(train_acc, 'b-', label='train_acc')
plt.plot(test_acc, 'r--', label='test_acc')
plt.title('train and test accuracy')
plt.legend()
plt.show()

Full Code

# -*- coding: utf-8 -*-
import tensorflow as tf
import pandas as pd
import numpy as np
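# load and preprocess the training data, exactly as in the walkthrough above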
data = pd.read_csv('train.csv')
data = data[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']]
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['Cabin'] = pd.factorize(data['Cabin'])[0]
data.fillna(0, inplace=True)
data['Sex'] = [1 if x=='male' else 0 for x in data['Sex']]
data['p1'] = np.array(data['Pclass']==1).astype(np.int32)
data['p2'] = np.array(data['Pclass']==2).astype(np.int32)
data['p3'] = np.array(data['Pclass']==3).astype(np.int32)
del data['Pclass']
data['e1'] = np.array(data['Embarked']=='S').astype(np.int32)
data['e2'] = np.array(data['Embarked']=='C').astype(np.int32)
data['e3'] = np.array(data['Embarked']=='Q').astype(np.int32)
del data['Embarked']

data_train = data[[ 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'p1', 'p2', 'p3', 'e1', 'e2', 'e3']]
data_target = data['Survived'].values.reshape(len(data), 1)

x = tf.placeholder("float", shape=[None, 12])
y = tf.placeholder("float", shape=[None, 1])

weight = tf.Variable(tf.random_normal([12, 1]))
bias = tf.Variable(tf.random_normal([1]))
output = tf.matmul(x, weight) + bias
pred = tf.cast(tf.sigmoid(output) > 0.5, tf.float32)
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=output))
train_step = tf.train.GradientDescentOptimizer(0.001).minimize(loss)
accuracy = tf.reduce_mean(tf.cast(tf.equal(pred, y), tf.float32))

data_test = pd.read_csv('test.csv')
data_test = data_test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']]
data_test['Age'] = data_test['Age'].fillna(data_test['Age'].mean())
data_test['Cabin'] = pd.factorize(data_test['Cabin'])[0]
data_test.fillna(0, inplace=True)
data_test['Sex'] = [1 if x=='male' else 0 for x in data_test['Sex']]
data_test['p1'] = np.array(data_test['Pclass']==1).astype(np.int32)
data_test['p2'] = np.array(data_test['Pclass']==2).astype(np.int32)
data_test['p3'] = np.array(data_test['Pclass']==3).astype(np.int32)
del data_test['Pclass']
data_test['e1'] = np.array(data_test['Embarked']=='S').astype(np.int32)
data_test['e2'] = np.array(data_test['Embarked']=='C').astype(np.int32)
data_test['e3'] = np.array(data_test['Embarked']=='Q').astype(np.int32)
del data_test['Embarked']

test_label = pd.read_csv('gender_submission.csv')
test_label = np.reshape(test_label['Survived'].values.astype(np.float32), (418,1))
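# note: gender_submission.csv is Kaggle's sample submission, used here only as a stand-in for the true test labels (which Kaggle does not release)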

sess = tf.Session()
sess.run(tf.global_variables_initializer())
loss_train=[]
train_acc = []
test_acc = []
# this line is essential: convert the DataFrame to a NumPy array so it can be shuffled with an integer index
data_train = data_train.values
for i in range(25000):
    # reshuffle the training data at the start of every epoch
    index = np.random.permutation(len(data_target))
    data_train = data_train[index]
    data_target = data_target[index]
    # train on mini-batches of 100 samples
    for n in range(len(data_target)//100 + 1):
        batch_xs = data_train[n*100:n*100+100]
        batch_ys = data_target[n*100:n*100+100]
        sess.run(train_step, feed_dict={x: batch_xs, y: batch_ys})
    # record loss and accuracy every 1000 epochs
    if i%1000 == 0:
        loss_temp = sess.run(loss, feed_dict={x: batch_xs, y: batch_ys})
        loss_train.append(loss_temp)
        train_acc_temp = sess.run(accuracy, feed_dict={x: batch_xs, y: batch_ys})
        train_acc.append(train_acc_temp)
        test_acc_temp = sess.run(accuracy, feed_dict={x: data_test, y: test_label})
        test_acc.append(test_acc_temp)
        print(loss_temp, train_acc_temp, test_acc_temp)

import matplotlib.pyplot as plt

plt.plot(loss_train, 'k-')
plt.title('train loss')
plt.show()

plt.plot(train_acc, 'b-', label='train_acc')
plt.plot(test_acc, 'r--', label='test_acc')
plt.title('train and test accuracy')
plt.legend()
plt.show()
---- End of this post. Thank you for reading; if anything is wrong, please point it out. ----