library(ggplot2)
library(psych)
library(nnet)
# Read the Titanic training and test CSV files, keeping text columns as
# character vectors rather than factors.
train <- read.csv(
  "C:/Users/秦文昭/Documents/python/titanic/train.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
test <- read.csv(
  "C:/Users/秦文昭/Documents/python/titanic/test.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
#str(train)
#str(test)
# The test set carries no label: append an all-NA Survived column (it lands in
# position 12) and move it to position 2, presumably to mirror train's column
# order, then stack both sets so features are engineered once for all rows.
test <- data.frame(test, Survived = NA)
test <- test[, c(1, 12, 2:11)]
alldata <- rbind(train, test)
#===================================Data import and preparation============================
#pclass
#windows()
#Pclass_S <- table(train$Survived, train$Pclass)
#Pclass_S_prop <- prop.table(Pclass_S, 2)
#ggplot(data = train, aes(x = Pclass, fill = factor(Survived)))+geom_bar(stat='count', position='dodge') + scale_x_continuous(breaks=c(1:3)) + labs(x = 'Pclass')
#sex
#windows()
#Sex_S <- table(train$Survived, train$Sex)
#Sex_S_prop <- prop.table(Sex_S, 2)
#ggplot(data = train, aes(x = Sex, fill = factor(Survived)))+geom_bar(stat='count', position='dodge')
#age
#windows()
#Agedata <- as.numeric(unlist(train$Age))
#Age_S <- table(train$Survived, cut(Agedata, breaks = c(0, 15, 30, 45, 60, 75, 90), labels = c('kids', 'teenagers', 'prime', 'middle', 'agedness', 'senium' )))
#Age_S_prop <- prop.table(Age_S, 2)
#ggplot(data = data.frame(train$Survived, Agedata), aes(x = cut(Agedata, breaks = c(0, 15, 30, 45, 60, 75, 90)), fill = factor(train$Survived)))+geom_bar(stat='count', position='dodge') + labs(x = 'Age') + scale_x_discrete(labels = c('kids', 'teenagers', 'prime', 'middle', 'agedness', 'senium'))
#family
#Sibsp_S <- table(train$Survived, train$SibSp)
#Parch_S <- table(train$Survived, train$Parch)
#Sibsp_S_prop <- prop.table(Sibsp_S, 2)
#Parch_S_prop <- prop.table(Parch_S, 2)
#windows()
#ggplot(data = train, aes(x = SibSp, fill = factor(Survived)))+geom_bar(stat='count', position='dodge') + scale_x_continuous(breaks=c(0:8)) + labs(x = 'Sibsp')
#ggplot(data = train, aes(x = Parch, fill = factor(Survived)))+geom_bar(stat='count', position='dodge') + scale_x_continuous(breaks=c(0:6)) + labs(x = 'Parch')
# Combined SibSp + Parch count for every training row.
Families <- train$SibSp + train$Parch
#ggplot(data = train, aes(x = Families, fill = factor(Survived)))+geom_bar(stat='count', position='dodge') + scale_x_continuous(breaks=c(0:10)) + labs(x = 'Families')
# Bucket the count into three groups, computed on the whole column at once
# (no per-row loop, so empty input and NA counts no longer raise errors):
#   1 -> count of 1 to 3
#   2 -> count of 0, or above 3 and up to 6
#   3 -> count above 6
family_label <- ifelse(
  Families >= 1 & Families <= 3,
  1,
  ifelse(Families > 6, 3, 2)
)
train <- data.frame(train, family_label = family_label)
#windows()
#fl_S <- table(train$Survived, train$family_label)
#fl_S_prop <- prop.table(fl_S, 2)
#ggplot(data = train, aes(x = family_label, fill = factor(Survived)))+geom_bar(stat='count', position='dodge') + scale_x_continuous(breaks=c(1:3)) + labs(x = 'family_label')
# Combined SibSp + Parch count for every row of the stacked data set.
Families <- alldata$SibSp + alldata$Parch
# Bucket the count into three groups, computed on the whole column at once
# (no per-row loop, so empty input and NA counts no longer raise errors):
#   1 -> count of 1 to 3
#   2 -> count of 0, or above 3 and up to 6
#   3 -> count above 6
family_label <- ifelse(
  Families >= 1 & Families <= 3,
  1,
  ifelse(Families > 6, 3, 2)
)
alldata <- data.frame(alldata, family_label = family_label)
#fare
#windows()
#plot(density(train[train$Survived==0,]$Fare),main='fare',col='red')
#lines(density(train[train$Survived==1,]$Fare),col='blue')
#legend('topright',legend=c('Survived=0','Survived=1'),col=c("red","blue"),lty=1,lwd=2)
#cabin
# Replace blank (empty-string) cabin values with the placeholder "U".
# The column is indexed directly: unlike `df[cond, ]$col <- value`, this is a
# harmless no-op when no row matches and tolerates NA in the condition.
train$Cabin[train$Cabin == ""] <- "U"
alldata$Cabin[alldata$Cabin == ""] <- "U"
# Extract the first character of each value in df$Cabin.
#
# Args:
#   df: a data frame with a `Cabin` column (character or factor).
# Returns:
#   A character vector with one element per row holding the first character of
#   the cabin string; NA where the cabin is NA or an empty string. A zero-row
#   data frame yields character(0).
split_Initials <- function(df) {
  initials <- substr(as.character(df$Cabin), 1, 1)
  # An empty cabin string has no first character; report it as NA, which is
  # what indexing the result of strsplit("", "") would produce.
  initials[!is.na(initials) & initials == ""] <- NA_character_
  initials
}
# Append the cabin's first character as a `deck` column, first to the training
# set and then to the stacked data set.
deck <- split_Initials(train)
train <- data.frame(train, deck = deck)
deck <- split_Initials(alldata)
alldata <- data.frame(alldata, deck = deck)
#windows()
#deck_S <- table(train$Survived, train$deck)
#deck_S_prop <- prop.table(deck_S, 2)
#ggplot(data = train, aes(x = deck, fill = factor(Survived)))+geom_bar(stat='count', position='dodge')
#name
# Lookup table from raw titles to a coarser title group. Each raw title keeps
# its leading space because it is cut out of "Surname, Title. Given names" by
# splitting on "," and "." without trimming.
title_list <- list(
  Officer = c(' Capt', ' Col', ' Major', ' Dr', ' Rev'),
  Royalty = c(' Don', ' Sir', ' the Countess', ' Dona', ' Lady'),
  Mrs = c(' Mme', ' Ms', ' Mrs'),
  Miss = c(' Mlle', ' Miss'),
  Mr = " Mr",
  Master = c(' Master', ' Jonkheer')
)
# Map a raw title to the name of the group that contains it.
#
# Args:
#   list: a named list of character vectors; defaults to the global title_list.
#   ele:  a single raw title to look up.
# Returns:
#   The name of the group containing `ele`. If several groups contain it, the
#   last one wins. If no group contains it, NA_character_ is returned with a
#   warning (previously this fell through to an undefined `map_name`, which
#   either raised "object not found" or silently picked up a global variable
#   of that name).
key_map <- function(list = title_list, ele) {
  map_name <- NA_character_
  matched <- FALSE
  for (i in seq_along(list)) {
    if (ele %in% list[[i]]) {
      map_name <- names(list)[i]
      matched <- TRUE
    }
  }
  if (!matched) {
    warning("no title group contains '", ele, "'; returning NA", call. = FALSE)
  }
  map_name
}
# Split each passenger name into a surname and a grouped title.
#
# Each name is cut at "," (the first piece is the surname); the second piece is
# cut at "." and its first part, leading space included, is the raw title,
# which key_map() then translates to its title group.
#
# Args:
#   df: a data frame with a `Name` column (character or factor).
# Returns:
#   A data frame with columns `surname` and `title`, one row per row of `df`
#   (zero rows for a zero-row input).
split_name <- function(df) {
  pieces <- strsplit(as.character(df$Name), ",")
  surname <- vapply(
    pieces,
    function(p) p[1],
    character(1),
    USE.NAMES = FALSE
  )
  raw_title <- vapply(
    pieces,
    function(p) unlist(strsplit(p[2], "[.]"))[1],
    character(1),
    USE.NAMES = FALSE
  )
  # key_map() handles one title at a time, so it is applied element-wise.
  title <- vapply(
    raw_title,
    function(one_title) key_map(ele = one_title),
    character(1),
    USE.NAMES = FALSE
  )
  data.frame("surname" = surname, "title" = title)
}
# Append the surname and grouped-title columns, first to the training set and
# then to the stacked data set.
tmp <- split_name(train)
train <- data.frame(train, tmp)
tmp <- split_name(alldata)
alldata <- data.frame(alldata, tmp)
#windows()
#title_S <- table(train$Survived, train$title)
#title_S_prop <- prop.table(title_S, 2)
#ggplot(data = train, aes(x = title, fill = factor(Survived)))+geom_bar(stat='count', position='dodge')
#embark
#windows()
#Embarked_S <- table(train$Survived, train$Embarked)
#Embarked_S_prop <- prop.table(Embarked_S, 2)
#ggplot(data = train, aes(x = Embarked, fill = factor(Survived)))+geom_bar(stat='count', position='dodge')
#====================================Feature engineering===============================
#Handle missing values
#alldata_cor=alldata[,-c(4,7,8,9,11,15)]
#windows()
#pairs.panels(alldata_cor[,-c(1,2)])#可视化
#embark
# Embarked: fill blank values with "C". The column is indexed directly so the
# statement is a no-op (instead of an error) when nothing is blank.
alldata$Embarked[alldata$Embarked == ""] <- "C"
#fare
# Impute missing fares with the median fare of Pclass 3 rows embarked at "S".
tmp <- alldata[alldata$Pclass == 3 & alldata$Embarked == "S", ]
alldata$Fare[is.na(alldata$Fare)] <- median(tmp$Fare, na.rm = TRUE)
#alldata[is.na(alldata$Fare),]
#age
# Age is overwritten, row by row, with the Age column of a separate CSV file.
# NOTE(review): presumably that file holds pre-imputed ages in the same row
# order as alldata - confirm.
alldata_age <- read.csv(
  "C:/Users/秦文昭/Documents/python/titanic/alldata_age.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
alldata$Age <- alldata_age$Age
#sum(is.na(alldata$Age))
rm(alldata_age)
rm(tmp)
#======================================model=============================
library(foreach)
library(caret)#这两个是做交叉验证用的
library(pROC)
# Drop the columns at positions 4, 7, 8, 9, 11 and 15. Position 15 is the
# surname column added earlier; the others are presumably Name, SibSp, Parch,
# Ticket and Cabin under the standard Titanic column order - confirm.
alldata <- alldata[, -c(4, 7, 8, 9, 11, 15)]
# Center the two continuous columns at positions 5 and 6 (presumably Age and
# Fare). NOTE(review): scale = FALSE only subtracts the column mean; it does
# not rescale to unit variance, so these columns keep their original spread.
alldata[, c(5, 6)] <- scale(alldata[, c(5, 6)], scale = FALSE)
# Dummy (indicator) variables
class_sex <- class.ind(alldata$Sex)
class_embarked <- class.ind(alldata$Embarked)
# class.ind() returns a matrix, so the columns must be renamed via colnames();
# names() on a matrix does not touch the column names, which previously left a
# bare "C" column that collided with the deck indicator of the same name.
colnames(class_embarked) <- paste0("embarked_", colnames(class_embarked))
class_family <- class.ind(alldata$family_label)
colnames(class_family) <- paste0("fl", colnames(class_family))
class_deck <- class.ind(alldata$deck)
class_title <- class.ind(alldata$title)
# Keep columns 1, 2, 3, 5, 6 and 8 of the reduced frame (8 is family_label) and
# append all indicator matrices.
alldata <- data.frame(
  alldata[, c(1, 2, 3, 5, 6, 8)],
  class_sex,
  class_embarked,
  class_family,
  class_deck,
  class_title
)
# End of dummy variables
# Set the first column aside as the id and remove it from the features.
id <- alldata[, 1]
alldata <- alldata[, -1]
# Rows without a Survived value form the test set; the first 891 rows form the
# training set. NOTE(review): 891 is hard-coded - confirm it equals the number
# of labelled rows.
test <- alldata[is.na(alldata$Survived), ]
train <- alldata[1:891, ]
# Convert predicted probabilities into 0/1 class labels.
#
# Use this when a model outputs probabilities rather than classes.
#
# Args:
#   y:     numeric vector of scores.
#   alpha: decision threshold; values below it become 0, all others become 1.
#          It was previously ignored in favor of a hard-coded 0.5.
# Returns:
#   `y` with every non-missing value replaced by 0 or 1; NA stays NA and an
#   empty input returns an empty vector.
seperate <- function(y, alpha = 0.5) {
  # Evaluate the comparison once, before y is overwritten.
  below <- y < alpha
  y[below] <- 0
  y[!below] <- 1
  y
}
#========================================knn==============================
#Split the labelled data into a training part and a hold-out part
# Fixed seed so the partition (p = 0.8) is reproducible.
set.seed(666)
index <- createDataPartition(y = train$Survived, p = 0.8, list = FALSE)
survived_train <- train[index, ]
survived_test <- train[-index, ]
library(class)
# Hold-out accuracy in percent for k = 1..30. Column 1 is left out of the
# feature matrices and supplies the class labels.
accuracy <- numeric(30)
for (i in seq_len(30)) {
  KNN_M <- knn(
    train = survived_train[, -1],
    test = survived_test[, -1],
    cl = survived_train[, 1],
    k = i
  )
  # Cross-tabulate actual against predicted labels; the diagonal counts the
  # correct predictions.
  CT <- table(survived_test$Survived, KNN_M)
  accuracy[i] <- sum(diag(CT)) / sum(CT) * 100
}
# Best accuracy and the k that achieves it.
max(accuracy)
which.max(accuracy)
# Second evaluation: hold out 104 row indices drawn at random from 1..891,
# with a fixed seed for reproducibility.
set.seed(123)
random <- sample(1:891, 104)
result_KNN_R <- train[random, 1]
# Hold-out accuracy in percent for k = 1..30, training on all remaining rows.
accuracy_KNN_R <- numeric(30)
for (i in seq_len(30)) {
  KNN_M <- knn(
    train = train[-random, -1],
    test = train[random, -1],
    cl = train[-random, 1],
    k = i
  )
  # Cross-tabulate actual against predicted labels; the diagonal counts the
  # correct predictions.
  CT_KNN_R <- table(result_KNN_R, KNN_M)
  accuracy_KNN_R[i] <- sum(diag(CT_KNN_R)) / sum(CT_KNN_R) * 100
}
# Best accuracy and the k that achieves it.
accuracy_KNN_R_Max <- max(accuracy_KNN_R)
accuracy_KNN_R_Max
which.max(accuracy_KNN_R)
#=================================