本文实例总结了Python常见的pandas用法。分享给大家供大家参考,具体如下:
'''
想要学习Python?Python学习交流群:984632579满足你的需求,资料都已经上传群文件,可以自行下载!
'''
import numpy as np
import pandas as pd
s = pd.Series([1,3,6, np.nan, 44, 1]) #定义一个序列。 序列就是一列内容,每一行有一个index值
print(s)
print(s.index)
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
RangeIndex(start=0, stop=6, step=1)
dates = pd.date_range('20180101', periods=6)
print(dates)
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05', '2018-01-06'], dtype='datetime64[ns]', freq='D')
df1 = pd.DataFrame(np.arange(12).reshape(3,4)) #定义DataFrame,可以看作一个有index和colunms的矩阵
print(df)
0 1 2 3 0 0 1 2 3 1 4 5 6 7 2 8 9 10 11
?
1
2
df2 = pd.DataFrame(np.random.randn(6,4), index=dates, columns=['a', 'b', 'c', 'd']) #np.random.randn(6,4)生成6行4列矩阵
print(df)
a b c d 2018-01-01 0.300675 1.769383 1.244406 -1.058294 2018-01-02 0.832666 2.216755 0.178716 -0.156828 2018-01-03 1.314190 -0.866199 0.836150 1.001026 2018-01-04 -1.671724 1.147406 -0.148676 -0.272555 2018-01-05 1.146664 2.022861 -1.833995 -0.627568 2018-01-06 -0.192242 1.517676 0.756707 0.058869
?
1
2
3
4
5
6
7
8
9
df = pd.DataFrame({'A':1.0,
'B':pd.Timestamp('20180101'),
'C':pd.Series(1, index=list(range(4)), dtype='float32'),
'D':np.array([3] * 4, dtype='int32'),
'E':pd.Categorical(['test', 'train', 'test', 'train']),
'F':'foo'}) #按照给出的逐列定义df
print(df)
print(df.dtypes)
A B C D E F 0 1.0 2018-01-01 1.0 3 test foo 1 1.0 2018-01-01 1.0 3 train foo 2 1.0 2018-01-01 1.0 3 test foo 3 1.0 2018-01-01 1.0 3 train foo A float64 B datetime64[ns] C float32 D int32 E category F object dtype: object
?
1
2
3
4
#df的行、列、值
print(df.index)
print(df.columns)
print(df.values)
Int64Index([0, 1, 2, 3], dtype='int64') Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object') [[1.0 Timestamp('2018-01-01 00:00:00') 1.0 3 'test' 'foo'] [1.0 Timestamp('2018-01-01 00:00:00') 1.0 3 'train' 'foo'] [1.0 Timestamp('2018-01-01 00:00:00') 1.0 3 'test' 'foo'] [1.0 Timestamp('2018-01-01 00:00:00') 1.0 3 'train' 'foo']]
?
1
2
print(df.describe()) #统计
print(df.T) #转置
A C D count 4.0 4.0 4.0 mean 1.0 1.0 3.0 std 0.0 0.0 0.0 min 1.0 1.0 3.0 25% 1.0 1.0 3.0 50% 1.0 1.0 3.0 75% 1.0 1.0 3.0 max 1.0 1.0 3.0 0 1 2 \ A 1 1 1 B 2018-01-01 00:00:00 2018-01-01 00:00:00 2018-01-01 00:00:00 C 1 1 1 D 3 3 3 E test train test F foo foo foo 3 A 1 B 2018-01-01 00:00:00 C 1 D 3 E train F foo
?
1
2
3
#df排序
print(df.sort_index(axis=1, ascending=False)) #根据索引值对各行进行排序(相当于重新排列各列的位置)
print(df.sort_values(by='E')) #根据内容值对各列进行排序
F E D C B A 0 foo test 3 1.0 2018-01-01 1.0 1 foo train 3 1.0 2018-01-01 1.0 2 foo test 3 1.0 2018-01-01 1.0 3 foo train 3 1.0 2018-01-01 1.0 A B C D E F 0 1.0 2018-01-01 1.0 3 test foo 2 1.0 2018-01-01 1.0 3 test foo 1 1.0 2018-01-01 1.0 3 train foo 3 1.0 2018-01-01 1.0 3 train foo
?
1
2
3
4
5
6
7
8
indexes = pd.date_range('20180101', periods=6)
df3 = pd.DataFrame(np.arange(24).reshape(6, 4), index=indexes, columns=['A', 'B', 'C', 'D'])
print(df3)
print()
#选择column
print(df3['A'])
print()
print(df3.A)
A B C D 2018-01-01 0 1 2 3 2018-01-02 4 5 6 7 2018-01-03 8 9 10 11 2018-01-04 12 13 14 15 2018-01-05 16 17 18 19 2018-01-06 20 21 22 23 2018-01-01 0 2018-01-02 4 2018-01-03 8 2018-01-04 12 2018-01-05 16 2018-01-06 20 Freq: D, Name: A, dtype: int32 2018-01-01 0 2018-01-02 4 2018-01-03 8 2018-01-04 12 2018-01-05 16 2018-01-06 20 Freq: D, Name: A, dtype: int32 A B C D 2018-01-01 0 1 2 3 2018-01-02 4 5 6 7 2018-01-03 8 9 10 11
?
1
2
3
4
5
6
#选择行, 类似limit语句
print(df3[0:0])
print()
print(df3[0:3])
print()
print(df3['20180103':'20180105'])
Empty DataFrame Columns: [A, B, C, D] Index: [] A B C D 2018-01-01 0 1 2 3 2018-01-02 4 5 6 7 2018-01-03 8 9 10 11 A B C D 2018-01-03 8 9 10 11 2018-01-04 12 13 14 15 2018-01-05 16 17 18 19
?
1
print(df3.loc['20180102']) #返回指定行构成的序列
A 4 B 5 C 6 D 7 Name: 2018-01-02 00:00:00, dtype: int32
?
1
2
3
4
5
print(df3.loc['20180103', ['A','C']]) #列筛选
print()
print(df3.loc['20180103':'20180105', ['A','C']]) #子df,类似select A, C from df limit ...
print()
print(df3.loc[:, ['A', 'B']])
A 8 C 10 Name: 2018-01-03 00:00:00, dtype: int32 A C 2018-01-03 8 10 2018-01-04 12 14 2018-01-05 16 18 A B 2018-01-01 0 1 2018-01-02 4 5 2018-01-03 8 9 2018-01-04 12 13 2018-01-05 16 17 2018-01-06 20 21
?
1
2
3
4
5
6
print(df3);print()
print(df3.iloc[1]);print()
print(df3.iloc[1,1]);print()
print(df3.iloc[:,1]);print()
print(df3.iloc[0:3,1:3]);print()
print(df3.iloc[[1,3,5],[0,2]]) #行可以不连续,limit做不到
A B C D 2018-01-01 0 1 2 3 2018-01-02 4 5 6 7 2018-01-03 8 9 10 11 2018-01-04 12 13 14 15 2018-01-05 16 17 18 19 2018-01-06 20 21 22 23 A 4 B 5 C 6 D 7 Name: 2018-01-02 00:00:00, dtype: int32 5 2018-01-01 1 2018-01-02 5 2018-01-03 9 2018-01-04 13 2018-01-05 17 2018-01-06 21 Freq: D, Name: B, dtype: int32 B C 2018-01-01 1 2 2018-01-02 5 6 2018-01-03 9 10 A C 2018-01-02 4 6 2018-01-04 12 14 2018-01-06 20 22
?
1
2
3
# print(df3.ix[:3, ['A', 'C']])\
print(df3);print()
print(df3[df3.A >= 8]) #根据值进行条件过滤,类似where A >= 8条件语句
A B C D 2018-01-01 0 1 2 3 2018-01-02 4 5 6 7 2018-01-03 8 9 10 11 2018-01-04 12 13 14 15 2018-01-05 16 17 18 19 2018-01-06 20 21 22 23 A B C D 2018-01-03 8 9 10 11 2018-01-04 12 13 14 15 2018-01-05 16 17 18 19 2018-01-06 20 21 22 23
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
indexes1 = pd.date_range('20180101', periods=6)
df4 = pd.DataFrame(np.arange(24).reshape(6, 4), index=indexes1, columns=['A', 'B', 'C', 'D'])
print(df4);print()
#给某个元素赋值
df4.A[1] = 1111
df4.B['20180103'] = 2222
df4.iloc[3, 2] = 3333
df4.loc['20180105', 'D'] = 4444
print(df4);print()
#范围赋值
df4.B[df4.A < 10] = -1
print(df4);print()
df4[df4.A < 10] = 0
print(df4);print()
A B C D 2018-01-01 0 1 2 3 2018-01-02 4 5 6 7 2018-01-03 8 9 10 11 2018-01-04 12 13 14 15 2018-01-05 16 17 18 19 2018-01-06 20 21 22 23 A B C D 2018-01-01 0 1 2 3 2018-01-02 1111 5 6 7 2018-01-03 8 2222 10 11 2018-01-04 12 13 3333 15 2018-01-05 16 17 18 4444 2018-01-06 20 21 22 23 A B C D 2018-01-01 0 -1 2 3 2018-01-02 1111 5 6 7 2018-01-03 8 -1 10 11 2018-01-04 12 13 3333 15 2018-01-05 16 17 18 4444 2018-01-06 20 21 22 23 A B C D 2018-01-01 0 0 0 0 2018-01-02 1111 5 6 7 2018-01-03 0 0 0 0 2018-01-04 12 13 3333 15 2018-01-05 16 17 18 4444 2018-01-06 20 21 22 23
?
1
2
3
4
5
6
7
8
9
10
indexes1 = pd.date_range('20180101', periods=6)
df4 = pd.DataFrame(np.arange(24).reshape(6, 4), index=indexes1, columns=['A', 'B', 'C', 'D'])
print(df4);print()
#添加一列
df4['E'] = np.NaN
print(df4);print()
#由于index没对齐,原df没有的行默认为NaN,类型为float64,多出的行丢弃
df4['F'] = pd.Series([1,2,3,4,5,6], index=pd.date_range('20180102', periods=6))
print(df4);print()
print(df4.dtypes)
A B C D 2018-01-01 0 1 2 3 2018-01-02 4 5 6 7 2018-01-03 8 9 10 11 2018-01-04 12 13 14 15 2018-01-05 16 17 18 19 2018-01-06 20 21 22 23 A B C D E 2018-01-01 0 1 2 3 NaN 2018-01-02 4 5 6 7 NaN 2018-01-03 8 9 10 11 NaN 2018-01-04 12 13 14 15 NaN 2018-01-05 16 17 18 19 NaN 2018-01-06 20 21 22 23 NaN A B C D E F 2018-01-01 0 1 2 3 NaN NaN 2018-01-02 4 5 6 7 NaN 1.0 2018-01-03 8 9 10 11 NaN 2.0 2018-01-04 12 13 14 15 NaN 3.0 2018-01-05 16 17 18 19 NaN 4.0 2018-01-06 20 21 22 23 NaN 5.0 A int32 B int32 C int32 D int32 E float64 F float64 dtype: object
?
1
2
3
4
5
6
7
8
9
10
11
12
df_t = pd.DataFrame(np.arange(24).reshape(6, 4), index=[1,2,3,4,5,6], columns=['A', 'B', 'C', 'D'])
df_t.iloc[0, 1] = np.NaN
df_t.iloc[1, 2] = np.NaN
df = df_t.copy()
print(df);print()
print(df.dropna(axis=0, how='any'));print()
df = df_t.copy()
print(df.dropna(axis=1, how='any'));print()
df = df_t.copy()
df.C = np.NaN
print(df);print()
print(df.dropna(axis=1, how='all'));print()
A B C D 1 0 NaN 2.0 3 2 4 5.0 NaN 7 3 8 9.0 10.0 11 4 12 13.0 14.0 15 5 16 17.0 18.0 19 6 20 21.0 22.0 23 A B C D 3 8 9.0 10.0 11 4 12 13.0 14.0 15 5 16 17.0 18.0 19 6 20 21.0 22.0 23 A D 1 0 3 2 4 7 3 8 11 4 12 15 5 16 19 6 20 23 A B C D 1 0 NaN NaN 3 2 4 5.0 NaN 7 3 8 9.0 NaN 11 4 12 13.0 NaN 15 5 16 17.0 NaN 19 6 20 21.0 NaN 23 A B D 1 0 NaN 3 2 4 5.0 7 3 8 9.0 11 4 12 13.0 15 5 16 17.0 19 6 20 21.0 23
?
1
2
3
4
5
6
7
df = df_t.copy()
print(df);print()
print(df.isna());print()
print(df.isnull().any());print() #isnull是isna别名,功能一样
print(df.isnull().any(axis=1));print()
print(np.any(df.isna() == True));print()
print(df.fillna(value=0)) #将NaN赋值
A B C D 1 0 NaN 2.0 3 2 4 5.0 NaN 7 3 8 9.0 10.0 11 4 12 13.0 14.0 15 5 16 17.0 18.0 19 6 20 21.0 22.0 23 A B C D 1 False True False False 2 False False True False 3 False False False False 4 False False False False 5 False False False False 6 False False False False A False B True C True D False dtype: bool 1 True 2 True 3 False 4 False 5 False 6 False dtype: bool True A B C D 1 0 0.0 2.0 3 2 4 5.0 0.0 7 3 8 9.0 10.0 11 4 12 13.0 14.0 15 5 16 17.0 18.0 19 6 20 21.0 22.0 23
?
1
2
3
data = pd.read_csv('D:/pythonwp/test/student.csv')
print(data)
data.to_pickle('D:/pythonwp/test/student.pickle')
id name age gender 0 1 牛帅 23 Male 1 2 gyb 89 Male 2 3 xxs 27 Male 3 4 hey 24 Female 4 5 奥莱利赫本 66 Female 5 6 Jackson 61 Male 6 7 牛帅 23 Male
?
1
2
3
4
5
6
7
8
9
10
df0 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['A', 'B', 'C', 'D'])
df1 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['A', 'B', 'C', 'D'])
print(df0); print()
print(df1); print()
print(df2); print()
res = pd.concat([df0, df1, df2], axis = 0)
print(res); print()
res = pd.concat([df0, df1, df2], axis = 0, ignore_index=True)
print(res)
A B C D 0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 A B C D 0 1.0 1.0 1.0 1.0 1 1.0 1.0 1.0 1.0 2 1.0 1.0 1.0 1.0 A B C D 0 2.0 2.0 2.0 2.0 1 2.0 2.0 2.0 2.0 2 2.0 2.0 2.0 2.0 A B C D 0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 0 1.0 1.0 1.0 1.0 1 1.0 1.0 1.0 1.0 2 1.0 1.0 1.0 1.0 0 2.0 2.0 2.0 2.0 1 2.0 2.0 2.0 2.0 2 2.0 2.0 2.0 2.0 A B C D 0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0 5 1.0 1.0 1.0 1.0 6 2.0 2.0 2.0 2.0 7 2.0 2.0 2.0 2.0 8 2.0 2.0 2.0 2.0
?
1
2
3
4
5
6
7
8
df0 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['A', 'B', 'C', 'D'])
df1 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['E', 'F', 'C', 'D'])
res = pd.concat([df0, df1], ignore_index=True)
print(res);print()
res = pd.concat([df0, df1], join='outer', ignore_index=True)
print(res);print()
res = pd.concat([df0, df1], join='inner',ignore_index=True)
print(res);print()
A B C D E F 0 0.0 0.0 0.0 0.0 NaN NaN 1 0.0 0.0 0.0 0.0 NaN NaN 2 0.0 0.0 0.0 0.0 NaN NaN 3 NaN NaN 1.0 1.0 1.0 1.0 4 NaN NaN 1.0 1.0 1.0 1.0 5 NaN NaN 1.0 1.0 1.0 1.0 A B C D E F 0 0.0 0.0 0.0 0.0 NaN NaN 1 0.0 0.0 0.0 0.0 NaN NaN 2 0.0 0.0 0.0 0.0 NaN NaN 3 NaN NaN 1.0 1.0 1.0 1.0 4 NaN NaN 1.0 1.0 1.0 1.0 5 NaN NaN 1.0 1.0 1.0 1.0 C D 0 0.0 0.0 1 0.0 0.0 2 0.0 0.0 3 1.0 1.0 4 1.0 1.0 5 1.0 1.0
?
1
2
3
4
5
6
7
8
9
10
11
#横向合并
df0 = pd.DataFrame(np.ones((3, 4)) * 0, index=['1', '2', '3'], columns=['A', 'B', 'C', 'D'])
df1 = pd.DataFrame(np.ones((3, 4)) * 1, index=['2', '3', '4'], columns=['A', 'B', 'C', 'D'])
print(df0);print()
print(df1);print()
res = pd.concat([df0, df1], axis=1)
print(res);print()
res = pd.concat([df0, df1], axis=1, join='inner', ignore_index=True)
print(res);print()
res = pd.concat([df0, df1], axis=1, join_axes=[df0.index])
print(res);print()
A B C D 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 0.0 A B C D 2 1.0 1.0 1.0 1.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0 A B C D A B C D 1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN 2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0 0 1 2 3 4 5 6 7 2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 A B C D A B C D 1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN 2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
?
1
2
3
4
5
6
7
8
df0 = pd.DataFrame(np.ones((3, 4)) * 0, index=['1', '2', '3'], columns=['A', 'B', 'C', 'D'])
df1 = pd.DataFrame(np.ones((3, 4)) * 1, index=['2', '3', '4'], columns=['A', 'B', 'C', 'D'])
print(df0);print()
print(df1);print()
res = df0.append([df1, df1], ignore_index=False)
print(res);print()
s = pd.Series([1,2,3,4], index=['A','B','C','E'])
print(df0.append(s, ignore_index=True))
A B C D 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 0.0 A B C D 2 1.0 1.0 1.0 1.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0 A B C D 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 0.0 2 1.0 1.0 1.0 1.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0 2 1.0 1.0 1.0 1.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0 A B C D E 0 0.0 0.0 0.0 0.0 NaN 1 0.0 0.0 0.0 0.0 NaN 2 0.0 0.0 0.0 0.0 NaN 3 1.0 2.0 3.0 NaN 4.0
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
df1 = pd.DataFrame({'key':['K0', 'K1', 'K2'],
'A':['A0', 'A1', 'A2'],
'B':['B0', 'B1', 'B2']})
df2 = pd.DataFrame({'key':['K3', 'K1', 'K2'],
'C':['C3', 'C1', 'C2'],
'D':['D3', 'D1', 'D2']})
print(df1); print()
print(df2); print()
res = pd.merge(df1, df2, on='key')
print(res); print()
res = pd.merge(df1, df2, on='key', how='outer')
print(res); print()
res = pd.merge(df1, df2, on='key', how='left')
print(res); print()
res = pd.merge(df1, df2, on='key', how='right')
print(res); print()
A B key 0 A0 B0 K0 1 A1 B1 K1 2 A2 B2 K2 C D key 0 C3 D3 K3 1 C1 D1 K1 2 C2 D2 K2 A B key C D 0 A1 B1 K1 C1 D1 1 A2 B2 K2 C2 D2 A B key C D 0 A0 B0 K0 NaN NaN 1 A1 B1 K1 C1 D1 2 A2 B2 K2 C2 D2 3 NaN NaN K3 C3 D3 A B key C D 0 A0 B0 K0 NaN NaN 1 A1 B1 K1 C1 D1 2 A2 B2 K2 C2 D2 A B key C D 0 A1 B1 K1 C1 D1 1 A2 B2 K2 C2 D2 2 NaN NaN K3 C3 D3
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
df1 = pd.DataFrame({'key1':['K0', 'K0', 'K1'],
'key2':['K0', 'K1', 'K1'],
'A':['A0', 'A1', 'A2'],
'B':['B0', 'B1', 'B2']})
df2 = pd.DataFrame({'key1':['K0', 'K0', 'K1', 'K2'],
'key2':['K0', 'K0', 'K1', 'K2'],
'C':['C3', 'C1', 'C2', 'C4'],
'D':['D3', 'D1', 'D2', 'D4']})
print(df1); print()
print(df2); print()
res = pd.merge(df1, df2, on=['key1','key2'])
print(res); print()
res = pd.merge(df1, df2, on=['key1','key2'], how='outer', indicator='indi')
print(res); print()
A B key1 key2 0 A0 B0 K0 K0 1 A1 B1 K0 K1 2 A2 B2 K1 K1 C D key1 key2 0 C3 D3 K0 K0 1 C1 D1 K0 K0 2 C2 D2 K1 K1 3 C4 D4 K2 K2 A B key1 key2 C D 0 A0 B0 K0 K0 C3 D3 1 A0 B0 K0 K0 C1 D1 2 A2 B2 K1 K1 C2 D2 A B key1 key2 C D indi 0 A0 B0 K0 K0 C3 D3 both 1 A0 B0 K0 K0 C1 D1 both 2 A1 B1 K0 K1 NaN NaN left_only 3 A2 B2 K1 K1 C2 D2 both 4 NaN NaN K2 K2 C4 D4 right_only
?
1
2
3
4
5
6
7
8
9
10
11
12
13
#以上是根据值合并。下面根据index合并
df1 = pd.DataFrame({'A':['A0', 'A1', 'A2'],
'B':['B0', 'B1', 'B2']},
index=['index0', 'index1', 'index2'])
df2 = pd.DataFrame({'A':['C3', 'C1', 'C2'],
'D':['D3', 'D1', 'D2']},
index=['index3', 'index1', 'index2'])
print(df1); print()
print(df2); print()
res = pd.merge(df1, df2, left_index=True, right_index=True)
print(res); print()
res = pd.merge(df1, df2, left_index=True, right_index=True, how='outer', suffixes=['_b', '_g'])
print(res); print()
A B index0 A0 B0 index1 A1 B1 index2 A2 B2 A D index3 C3 D3 index1 C1 D1 index2 C2 D2 A_x B A_y D index1 A1 B1 C1 D1 index2 A2 B2 C2 D2 A_b B A_g D index0 A0 B0 NaN NaN index1 A1 B1 C1 D1 index2 A2 B2 C2 D2 index3 NaN NaN C3 D3
?
1
2
3
4
res = df1.join(df2, how='outer', lsuffix='_left', rsuffix='_right') #不用on默认用索引合并
print(res);print()
res = df1.join(df2, on='B', how='outer', lsuffix='_left', rsuffix='_right') #用on指定df1的某列和df2的索引合并
print(res);print()
A_left B A_right D index0 A0 B0 NaN NaN index1 A1 B1 C1 D1 index2 A2 B2 C2 D2 index3 NaN NaN C3 D3 A_left B A_right D index0 A0 B0 NaN NaN index1 A1 B1 NaN NaN index2 A2 B2 NaN NaN index2 NaN index3 C3 D3 index2 NaN index1 C1 D1 index2 NaN index2 C2 D2
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt #画图模块
s = pd.Series(np.random.randn(1000), index=np.arange(1000))
s = s.cumsum()
#须在命令行执行, jupyter会报错
#s.plot()
#plt.show()
df = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'])
df = df.cumsum()
print(df.head()); print() #head默认显示前5行
#须在命令行执行, jupyter会报错
#s.plot()
#plt.show()
#须在命令行执行, jupyter会报错
#'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'...
#class_B = df.plot.scatter(x='A', y='B', color='DarkBlue', label='Class B') #画图,scatter
#df.plot.scatter(x='A', y='C', color='DarkRed', label='Class C', class_B=class_B)
#plt.show()
A B C 0 -0.399363 -1.004210 0.641141 1 -1.970009 -0.608482 -0.758504 2 -3.081640 -0.617352 -1.143872 3 -2.174627 -1.383785 -1.011411 4 -1.415515 -1.892226 -2.511739
