相关: Python数据分析招式:pandas库提取清洗排序-1
要点:
- 数据的字符处理
- 数据的过滤
- 数据的分组
- 数据的透视表
# -*- coding: utf-8 -*-
# @File : 数据集的处理.py
# @Date : 2018-06-03
import pandas as pd
file = "data/train.csv"
df = pd.DataFrame(pd.read_csv(file))
print(df.head(3))
"""
PassengerId Survived Pclass ... Fare Cabin Embarked
0 1 0 3 ... 7.2500 NaN S
1 2 1 1 ... 71.2833 C85 C
2 3 1 3 ... 7.9250 NaN S
[3 rows x 12 columns]
"""
1、数据集的字符处理
# 对列的处理
mapping ={
'PassengerId': '乘客编号',
'Survived':'是否获救',
'Name':'姓名',
'Pclass':'船舱等级','Sex':'性别',
'Age':'年龄','SibSp':'兄弟姐妹数',
'Parch':'父母小孩数','Ticket':'船票',
'Fare':'船票费'
}
ret = df.rename(columns=mapping)
print(ret.head(3))
"""
乘客编号 是否获救 船舱等级 ... 船票费 Cabin Embarked
0 1 0 3 ... 7.2500 NaN S
1 2 1 1 ... 71.2833 C85 C
2 3 1 3 ... 7.9250 NaN S
[3 rows x 12 columns]
"""
# 对数据集里面的特定字符串进行替换
ret = df['Sex'].map({'female':'女','male':'男'})
print(ret.head(3))
"""
[3 rows x 12 columns]
0 男
1 女
2 女
Name: Sex, dtype: object
"""
# 对列的字符进行替换, 只保留数字部分
# contains,split,match,findall,endswith
df['Embarked']=df['Embarked'].replace(regex='[CS]', value='xxx')
print(df.head(3))
"""
PassengerId Survived Pclass ... Fare Cabin Embarked
0 1 0 3 ... 7.2500 NaN xxx
1 2 1 1 ... 71.2833 C85 xxx
2 3 1 3 ... 7.9250 NaN xxx
[3 rows x 12 columns]
"""
2、数据集的过滤
# 用逻辑表达式组合过滤 ==, !=
ret = df[(df['Sex']=='female')&(df['Age']>10)]
print(ret.head(3))
"""
PassengerId Survived Pclass ... Fare Cabin Embarked
1 2 1 1 ... 71.2833 C85 xxx
2 3 1 3 ... 7.9250 NaN xxx
3 4 1 1 ... 53.1000 C123 xxx
[3 rows x 12 columns]
"""
# query函数
ret = df.query('Age==[10, 20]')
print(ret[["Name", "Age"]].head(3))
"""
Name Age
12 Saundercock, Mr. William Henry 20.0
91 Andreasson, Mr. Paul Edvin 20.0
113 Jussila, Miss. Katriina 20.0
"""
3、数据的分类
# 用where函数
import numpy as np
ret=np.where(df['Age']>=18)
# apply函数
def convert_age(age):
if age> 0 and age < 10:
return "小孩"
elif age < 30:
return "大人"
else:
return "老人"
df["年龄分类"] = df['Age'].apply(convert_age)
print(df[["Name", "Age", "年龄分类"]].sample(3))
"""
Name Age 年龄分类
624 Bowen, Mr. David John "Dai" 21.0 大人
880 Shelley, Mrs. William (Imanita Parrish Hall) 25.0 大人
471 Cacic, Mr. Luka 38.0 老人
"""
4、 数据的切片和透视表
# groupby函数
print(df.groupby('Sex')[['Name', 'Sex']].count())
"""
Name Sex
Sex
female 314 314
male 577 577
"""
# 对数据进行轴切片分析
ret = df.groupby(['Survived','Pclass'])['Age'].agg(['size','max','min','mean'])
print(ret)
"""
Survived Pclass
0 1 80 71.0 2.00 43.695312
2 97 70.0 16.00 33.544444
3 372 74.0 1.00 26.555556
1 1 136 80.0 0.92 35.368197
2 87 62.0 0.67 25.901566
3 119 63.0 0.42 20.646118
"""
# 数据透视表
ret = df.pivot_table(columns=['Sex'],index=['Survived','Pclass'],values='Age',
aggfunc={'Age':[np.mean,min,max]})
print(ret)
"""
max mean min
Sex female male female male female male
Survived Pclass
0 1 50.0 71.0 25.666667 44.581967 2.00 18.00
2 57.0 70.0 36.000000 33.369048 24.00 16.00
3 48.0 74.0 23.818182 27.255814 2.00 1.00
1 1 63.0 80.0 34.939024 36.248000 14.00 0.92
2 55.0 62.0 28.080882 16.022000 2.00 0.67
3 63.0 45.0 19.329787 22.274211 0.75 0.42
"""
参考:
11招,高手都在用的数据分析招式