要点:
- 数据的基本处理
- 数据的提取
- 数据的初步清洗
- 数据的排序
泰坦尼克数据集下载地址: 地址1(需要注册): https://www.kaggle.com/c/titanic/data 地址2(百度网盘): https://pan.baidu.com/s/1Vp0QmVLu43_Hb9jHR2FKXg 密码: rdfr
导入数据
# -*- coding: utf-8 -*-
# @File : 泰坦尼克数据分析.py
# @Date : 2018-06-03
import numpy as np
import pandas as pd
# Path of the Titanic training set, relative to the current working directory.
file = "data/train.csv"
# pd.read_csv already returns a DataFrame, so the extra pd.DataFrame(...)
# wrapper used previously was redundant and has been dropped.
df = pd.read_csv(file)
1、数据的基本处理
# Dimensions of the table as a (rows, columns) tuple
print(df.shape)
# (891, 12)
# First three rows, taken with a positional slice
print(df.iloc[:3])
"""
PassengerId Survived Pclass ... Fare Cabin Embarked
0 1 0 3 ... 7.2500 NaN S
1 2 1 1 ... 71.2833 C85 C
2 3 1 3 ... 7.9250 NaN S
[3 rows x 12 columns]
"""
# Last three rows, taken with a negative positional slice
print(df.iloc[-3:])
"""
PassengerId Survived Pclass ... Fare Cabin Embarked
888 889 0 3 ... 23.45 NaN S
889 890 1 1 ... 30.00 C148 C
890 891 0 3 ... 7.75 NaN Q
[3 rows x 12 columns]
"""
# Structural summary: index range, per-column non-null counts, dtypes and
# memory usage.
# DataFrame.info() writes the report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()
"""
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
"""
# Descriptive statistics (count, mean, std, min, quartiles, max)
print(df.describe())
"""
PassengerId Survived ... Parch Fare
count 891.000000 891.000000 ... 891.000000 891.000000
mean 446.000000 0.383838 ... 0.381594 32.204208
std 257.353842 0.486592 ... 0.806057 49.693429
min 1.000000 0.000000 ... 0.000000 0.000000
25% 223.500000 0.000000 ... 0.000000 7.910400
50% 446.000000 0.000000 ... 0.000000 14.454200
75% 668.500000 1.000000 ... 0.000000 31.000000
max 891.000000 1.000000 ... 6.000000 512.329200
[8 rows x 7 columns]
"""
# Number of missing values in each column (isna is the same check as isnull)
print(df.isna().sum())
"""
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
"""
# Distinct values of the Pclass column, in order of first appearance
print(pd.unique(df["Pclass"]))
# [3 1 2]
2、数据的提取
# Fetch a single row by its index label (all columns)
print(df.loc[630, :])
"""
PassengerId 631
Survived 1
Pclass 1
Name Barkworth, Mr. Algernon Henry Wilson
Sex male
Age 80
SibSp 0
Parch 0
Ticket 27042
Fare 30
Cabin A23
Embarked S
Name: 630, dtype: object
"""
# Positional slice: the rows at positions 2, 3 and 4 (the end bound 5 is
# exclusive), restricted to the first five columns
print(df.iloc[2:5, 0:5])
"""
PassengerId ... Sex
2 3 ... female
3 4 ... female
4 5 ... male
[3 rows x 5 columns]
"""
# Conditional selection: rows whose cabin class (Pclass) is below 2 and whose
# sex is female. Each comparison is parenthesised because & binds tighter
# than < and ==.
print(df[(df["Pclass"] < 2) & (df["Sex"] == "female")])
# NOTE(review): in the original text everything between the "<" of this
# filter and a later ">" was lost (apparently stripped as if it were an HTML
# tag). The lost span presumably contained the filter's printed output, the
# heading of part 3 (initial data cleaning, announced in the outline at the
# top) and the first cleaning steps. Later sample outputs show Pclass as text
# labels rather than 1/2/3, so a relabelling step was probably lost as well.
# TODO: restore the missing steps from the original article.
#
# Reconstructed float -> int cast so that `ret` is defined. The column is an
# assumption: Fare is used because it is float64 with no missing values, so
# the cast cannot fail on NaN. TODO confirm which column the article used.
ret = df["Fare"].astype(int)
print(ret.dtypes)  # int32 in the original run (platform-dependent; may be int64)
# Rename a column; rename() returns a new frame and leaves df untouched
ret = df.rename({'Survived': '是否获救'}, axis="columns")
print(ret.iloc[:3])
"""
PassengerId 是否获救 Pclass ... Fare Cabin Embarked
0 1 0 三等舱 ... 7.2500 NaN S
1 2 1 一等舱 ... 71.2833 C85 C
2 3 1 三等舱 ... 7.9250 NaN S
[3 rows x 12 columns]
"""
# Remove duplicate values.
# Example: to see which embarkation categories exist, keep only the first
# occurrence of every value in the Embarked column.
ret = df["Embarked"].drop_duplicates(keep="first")
print(ret)
"""
0 S
1 C
5 Q
61 NaN
Name: Embarked, dtype: object
"""
# Value substitution: overwrite the Sex column with 'male' replaced by '男'
# (other values, e.g. 'female', are left as they are)
df["Sex"] = df["Sex"].replace(to_replace='male', value='男')
print(df["Sex"].iloc[:3])
"""
0 男
1 female
2 female
Name: Sex, dtype: object
"""
4、数据的排序
# Sort the whole frame by Age, largest first, then show the three oldest ages
print(df.sort_values(by="Age", ascending=False)["Age"].iloc[:3])
"""
630 80.0
851 74.0
493 71.0
Name: Age, dtype: float64
"""
# Sort by the index in descending order and show the first three rows
print(df.sort_index(ascending=False).iloc[:3])
"""
PassengerId Survived Pclass ... Fare Cabin Embarked
890 891 0 三等舱 ... 7.75 NaN Q
889 890 1 一等舱 ... 30.00 C148 C
888 889 0 三等舱 ... 23.45 NaN S
"""
参考文章: 18招,小白必看的数据分析招式|精选上篇