python气象信息系统工程 第四章

第4章 pandas:优秀的数据分析工具

4.2 pd.Series——序列

4.2.1 创建序列

1
2
3
4
# 创建序列
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2])
print(a)
0     9.1
1     9.5
2    10.0
3    11.2
dtype: float64
1
2
3
4
# 创建时指定索引
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2], index=['a', 'b', 'c', 'd'])
print(a)
a     9.1
b     9.5
c    10.0
d    11.2
dtype: float64
1
2
3
4
# 创建时指定名称
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2], index=['a', 'b', 'c', 'd'], name='t')
print(a)
a     9.1
b     9.5
c    10.0
d    11.2
Name: t, dtype: float64

4.2.2 时间索引

1
2
3
4
5
6
7
8
9
# Series with a datetime index; the index can also be viewed as integer epoch values
import pandas as pd
import numpy as np
a = pd.Series([9.1, 9.5, 10.0, 11.2],
              index=pd.to_datetime(['2020-02-19', '2020-02-20',
                                    '2020-02-21', '2020-02-22']),
              name='t')
print(a)
# np.int was only an alias of the builtin int and was removed in NumPy 1.24;
# spell out the 64-bit width explicitly instead.
print(a.index.astype(np.int64))
2020-02-19     9.1
2020-02-20     9.5
2020-02-21    10.0
2020-02-22    11.2
Name: t, dtype: float64
Int64Index([1582070400000000000, 1582156800000000000, 1582243200000000000,
            1582329600000000000],
           dtype='int64')


/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:9: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if __name__ == '__main__':
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:9: FutureWarning: casting datetime64[ns] values to int64 with .astype(...) is deprecated and will raise in a future version. Use .view(...) instead.
  if __name__ == '__main__':
1
2
3
4
5
6
7
# 时区转换
import datetime as dt
import pandas as pd
index_time = pd.to_datetime(['2020-02-19', '2020-02-20', '2020-02-21', '2020-02-22'])
index_time = index_time - dt.timedelta(hours=8)
a = pd.Series([9.1, 9.5, 10.0, 11.2], index=index_time, name='t')
print(a)
2020-02-18 16:00:00     9.1
2020-02-19 16:00:00     9.5
2020-02-20 16:00:00    10.0
2020-02-21 16:00:00    11.2
Name: t, dtype: float64

4.2.3 pd.Series对象的算术运算

1
2
3
4
5
# 序列的标量运算
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2], index=['a', 'b', 'c', 'd'], name='t')
b = a - 10
print(b)
a   -0.9
b   -0.5
c    0.0
d    1.2
Name: t, dtype: float64
1
2
3
4
5
6
7
8
9
10
11
12
13
# 按照索引的运算
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2], index=['a', 'b', 'c', 'd'], name='t1')
b = pd.Series([9.1, 9.5, 10.0, 11.2], index=['b', 'a', 'c', 'd'], name='t2')
print(b - a)
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2], index=['a', 'b', 'c', 'd'], name='t1')
b = pd.Series([9.1, 9.5, 10.0, 11.2], index=['b', 'c', 'd', 'e'], name='t2')
print(b - a)
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2], index=['a', 'b', 'c', 'd'], name='t1')
b = pd.Series([9.1, 9.5, 10.0, 11.2], index=['e', 'f', 'g', 'h'], name='t2')
print(b - a)
a    0.4
b   -0.4
c    0.0
d    0.0
dtype: float64
a    NaN
b   -0.4
c   -0.5
d   -1.2
e    NaN
dtype: float64
a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
f   NaN
g   NaN
h   NaN
dtype: float64

4.2.4 pd.Series对象的常用属性

1
2
3
4
# 序列的数据类型
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2], index=['a', 'b', 'c', 'd'], name='t')
print(a.dtype)
float64
1
2
3
4
# 序列的数据维度
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2], index=['a', 'b', 'c', 'd'], name='t')
print(a.ndim)
1
1
2
3
4
# 序列的数据形状
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2], index=['a', 'b', 'c', 'd'], name='t')
print(a.shape)
(4,)
1
2
3
4
# 通过标签索引访问单个元素
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2], index=['a', 'b', 'c', 'd'], name='t')
print(a.at['a'])
9.1
1
2
3
4
# 通过位置数值索引访问单个元素
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2], index=['a', 'b', 'c', 'd'], name='t')
print(a.iat[0])
9.1
1
2
3
4
5
6
# 通过标签列访问多个元素
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2], index=['a', 'b', 'c', 'd'], name='t')
print(a.loc['a'])
print('----------')
print(a.loc[['a', 'b']])
9.1
----------
a    9.1
b    9.5
Name: t, dtype: float64
1
2
3
4
# Access multiple elements with a boolean list (True keeps the element at that position)
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2], index=['a', 'b', 'c', 'd'], name='t')
print(a.loc[[True, False, True, False]])
a     9.1
c    10.0
Name: t, dtype: float64
1
2
3
4
5
6
# 通过位置访问多个元素
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2], index=['a', 'b', 'c', 'd'], name='t')
print(a.iloc[0])
print('----------')
print(a.iloc[[0, 1]])
9.1
----------
a    9.1
b    9.5
Name: t, dtype: float64
1
2
3
4
5
6
# 获取数据的原始np.ndarray对象
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2], index=['a', 'b', 'c', 'd'], name='t')
b = a.values
print(b)
print(type(b))
[ 9.1  9.5 10.  11.2]
<class 'numpy.ndarray'>

4.2.5 pd.Series对象的常用方法

1
2
3
4
5
6
7
# 删除缺测值NaN
import numpy as np
import pandas as pd
a = pd.Series([9.1, np.nan, 10.0, 11.2], index=['a', 'b', 'c', 'd'], name='t')
print(a)
b = a.dropna()
print(b)
a     9.1
b     NaN
c    10.0
d    11.2
Name: t, dtype: float64
a     9.1
c    10.0
d    11.2
Name: t, dtype: float64
1
2
3
4
5
6
7
# 根据站号索引分组计算平均值
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2],
index=['sta1', 'sta2', 'sta1', 'sta2'],
name='t')
b = a.groupby(level=0).mean()
print(b)
sta1     9.55
sta2    10.35
Name: t, dtype: float64
1
2
3
4
5
6
7
# 根据阈值分组计算平均值
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2],
index=['a', 'b', 'c', 'd'],
name='t')
b = a.groupby(a>9.9).mean()
print(b)
t
False     9.3
True     10.6
Name: t, dtype: float64
1
2
3
4
5
6
7
# 按规则映射元素
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2],
index=['a', 'b', 'c', 'd'],
name='t')
b = a.map(lambda x: x - 5)
print(b)
a    4.1
b    4.5
c    5.0
d    6.2
Name: t, dtype: float64
1
2
3
4
5
6
7
8
# 插值填充NaN
import numpy as np
import pandas as pd
a = pd.Series([8.0, np.nan, 10.0, 11.0],
index=['a', 'b', 'c', 'd'],
name='t')
b = a.interpolate(method='linear')
print(b)
a     8.0
b     9.0
c    10.0
d    11.0
Name: t, dtype: float64
1
2
3
4
5
6
7
8
# 指定值填充NaN
import numpy as np
import pandas as pd
a = pd.Series([8.0, np.nan, 10.0, 11.0],
index=['a', 'b', 'c', 'd'],
name='t')
b = a.fillna(value=999)
print(b)
a      8.0
b    999.0
c     10.0
d     11.0
Name: t, dtype: float64
1
2
3
4
5
6
7
8
9
10
# Fill NaN from neighbouring values
import numpy as np
import pandas as pd
a = pd.Series([8.0, np.nan, 10.0, 11.0],
              index=['a', 'b', 'c', 'd'],
              name='t')
# fillna(method=...) is deprecated since pandas 2.1; call bfill()/ffill() directly.
b = a.bfill()  # backward fill: NaN takes the next valid value
print(b)
b = a.ffill()  # forward fill: NaN takes the previous valid value
print(b)
a     8.0
b    10.0
c    10.0
d    11.0
Name: t, dtype: float64
a     8.0
b     8.0
c    10.0
d    11.0
Name: t, dtype: float64
1
2
3
4
5
6
7
# 降采样
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2],
index=pd.to_datetime(['2020-02-19', '2020-02-20',
'2020-02-22', '2020-02-23']),
name='t')
print(a.resample('2D').max())
2020-02-19     9.5
2020-02-21    10.0
2020-02-23    11.2
Freq: 2D, Name: t, dtype: float64
1
2
3
4
5
6
7
8
# 升采样
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2],
index=pd.to_datetime(['2020-02-19', '2020-02-20',
'2020-02-22', '2020-02-23']),
name='t')
print(a.resample('1D').asfreq())
print(a.resample('1D').bfill())
2020-02-19     9.1
2020-02-20     9.5
2020-02-21     NaN
2020-02-22    10.0
2020-02-23    11.2
Freq: D, Name: t, dtype: float64
2020-02-19     9.1
2020-02-20     9.5
2020-02-21    10.0
2020-02-22    10.0
2020-02-23    11.2
Freq: D, Name: t, dtype: float64
1
2
3
4
5
6
7
8
# 按照指定顺序排序
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2],
index=['a', 'b', 'c', 'd'],
name='t')
print(a)
b = a.reindex(['d', 'b', 'a', 'c'])
print(b)
a     9.1
b     9.5
c    10.0
d    11.2
Name: t, dtype: float64
d    11.2
b     9.5
a     9.1
c    10.0
Name: t, dtype: float64
1
2
3
4
5
6
7
8
9
10
11
# 使用reindex_like()可以用一个序列的索引来对另一个序列进行排序
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2],
index=['a', 'b', 'c', 'd'],
name='t1')
b = pd.Series([0, 0, 0, 0],
index=['b', 'd', 'a', 'd'],
name='t2')
print(a)
c = a.reindex_like(b)
print(c)
a     9.1
b     9.5
c    10.0
d    11.2
Name: t1, dtype: float64
b     9.5
d    11.2
a     9.1
d    11.2
Name: t1, dtype: float64
1
2
3
4
5
6
7
# 重命名序列
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2],
index=['a', 'b', 'c', 'd'],
name='hello')
b = a.rename('t')
print(b)
a     9.1
b     9.5
c    10.0
d    11.2
Name: t, dtype: float64
1
2
3
4
5
6
# 滑动窗口计算
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2, 12.1, 13.6, 14.5, 10.1],
name='t')
b = a.rolling(3).mean()
print(b)
0          NaN
1          NaN
2     9.533333
3    10.233333
4    11.100000
5    12.300000
6    13.400000
7    12.733333
Name: t, dtype: float64
1
2
3
4
5
6
# 中央平均的滑动窗口计算
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2, 12.1, 13.6, 14.5, 10.1],
name='t')
b = a.rolling(3, center=True).mean()
print(b)
0          NaN
1     9.533333
2    10.233333
3    11.100000
4    12.300000
5    13.400000
6    12.733333
7          NaN
Name: t, dtype: float64
1
2
3
4
5
6
7
8
# 平移数据
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2],
index=['a', 'b', 'c', 'd'],
name='t')
print(a)
b = a.shift(3)
print(b)
a     9.1
b     9.5
c    10.0
d    11.2
Name: t, dtype: float64
a    NaN
b    NaN
c    NaN
d    9.1
Name: t, dtype: float64
1
2
3
4
5
6
7
8
9
10
# 按照索引排序
import pandas as pd
a = pd.Series([9.1, 9.5, 10.0, 11.2],
index=['b', 'd', 'c', 'a'],
name='t')
print(a)
b = a.sort_index()
print(b)
c = a.sort_index(ascending=False)
print(c)
b     9.1
d     9.5
c    10.0
a    11.2
Name: t, dtype: float64
a    11.2
b     9.1
c    10.0
d     9.5
Name: t, dtype: float64
d     9.5
c    10.0
b     9.1
a    11.2
Name: t, dtype: float64
1
2
3
4
5
6
7
8
9
10
# 按照数据排序
import pandas as pd
a = pd.Series([9.5, 9.1, 11.2, 10.0],
index=['a', 'b', 'c', 'd'],
name='t')
print(a)
b = a.sort_values()
print(b)
c = a.sort_values(ascending=False)
print(c)
a     9.5
b     9.1
c    11.2
d    10.0
Name: t, dtype: float64
b     9.1
a     9.5
d    10.0
c    11.2
Name: t, dtype: float64
c    11.2
d    10.0
a     9.5
b     9.1
Name: t, dtype: float64
1
2
3
4
5
6
7
8
# 获取最大值/最小值
import numpy as np
import pandas as pd
a = pd.Series([9.5, np.nan, 11.2, 10.0],
index=['a', 'b', 'c', 'd'],
name='t')
print(a.max())
print(a.min())
11.2
9.5
1
2
3
4
5
6
7
8
# 获取最大值/最小值对应的位置
import numpy as np
import pandas as pd
a = pd.Series([9.5, np.nan, 11.2, 10.0],
index=['a', 'b', 'c', 'd'],
name='t')
print(a.argmax())
print(a.argmin())
2
0
1
2
3
4
5
6
7
8
# 获取最大值/最小值对应的标签索引
import numpy as np
import pandas as pd
a = pd.Series([9.5, np.nan, 11.2, 10.0],
index=['a', 'b', 'c', 'd'],
name='t')
print(a.idxmax())
print(a.idxmin())
c
a
1
2
3
4
5
6
7
8
# 计算标准差/无偏方差
import numpy as np
import pandas as pd
a = pd.Series([9.5, np.nan, 11.2, 10.0],
index=['a', 'b', 'c', 'd'],
name='t')
print(a.std())
print(a.var())
0.87368949480541
0.7633333333333326
1
2
3
4
5
6
7
8
9
# 计算协方差
import pandas as pd
a = pd.Series([9.5, 10.1, 11.2, 10.0],
index=['a', 'b', 'c', 'd'],
name='t1')
b = pd.Series([8.5, 9.1, 12.1, 10.5],
index=['a', 'b', 'c', 'd'],
name='t2')
print(a.cov(b))
1.046666666666666
1
2
3
4
5
6
7
# 序列求和/求均值
import pandas as pd
a = pd.Series([9.5, 10.1, 11.2, 10.0],
index=['a', 'b', 'c', 'd'],
name='t')
print(a.sum())
print(a.mean())
40.8
10.2
1
2
3
4
5
6
# 计算绝对值
import pandas as pd
a = pd.Series([-9.5, 10.1, -11.2, 10.0],
index=['a', 'b', 'c', 'd'],
name='t')
print(a.abs())
a     9.5
b    10.1
c    11.2
d    10.0
Name: t, dtype: float64
1
2
3
4
5
6
# 保存为CSV文件
import pandas as pd
a = pd.Series([-9.5, 10.1, -11.2, 10.0],
index=['a', 'b', 'c', 'd'],
name='t')
a.to_csv('series_with_index.csv')
1
2
3
4
5
6
# 忽略元素名字和索引,只保存数据
import pandas as pd
a = pd.Series([-9.5, 10.1, -11.2, 10.0],
index=['a', 'b', 'c', 'd'],
name='t')
a.to_csv('series_no_index.csv', index=False, header=False)
1
2
3
4
5
6
7
8
# 转换为列表对象
import pandas as pd
a = pd.Series([-9.5, 10.1, -11.2, 10.0],
index=['a', 'b', 'c', 'd'],
name='t')
b = a.to_list()
print(b)
print(type(b))
[-9.5, 10.1, -11.2, 10.0]
<class 'list'>
1
2
3
4
5
6
7
8
9
# Convert the data type of a Series
import numpy as np  # the cell uses the np alias below, so import it under that name
import pandas as pd
a = pd.Series([-9.5, 10.1, -11.2, 10.0],
              index=['a', 'b', 'c', 'd'],
              name='t')
print(a)
# float64 -> int32 drops the fractional part (e.g. -9.5 becomes -9)
b = a.astype(np.int32)
print(b)
a    -9.5
b    10.1
c   -11.2
d    10.0
Name: t, dtype: float64
a    -9
b    10
c   -11
d    10
Name: t, dtype: int32

4.3 pd.DataFrame——数据框

4.3.1 创建数据框

1
2
3
4
5
6
7
# 创建一个简单的数据框
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]]
)
print(a)
      0    1     2
0  21.7  983  0.64
1  19.2  991  0.75
2  13.4  973  0.83
1
2
3
4
5
6
7
8
9
# 为数据框添加行列索引
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a)
       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
1
2
3
4
5
6
7
8
9
10
11
# 抽取其中的序列对象
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
b = a['t']
print(b)
print(type(b))
s1    21.7
s2    19.2
s3    13.4
Name: t, dtype: float64
<class 'pandas.core.series.Series'>
1
2
3
4
5
6
7
8
9
# 包含不同数据类型的数据框
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a.dtypes)
t     float64
p       int64
rh    float64
dtype: object

4.3.2 pd.DataFrame的时间索引

1
2
3
4
5
6
7
8
9
# 带有时间索引的数据框
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=pd.to_datetime(['2020-02-19', '2020-02-20', '2020-02-22']),
columns=['t', 'p', 'rh']
)
print(a)
               t    p    rh
2020-02-19  21.7  983  0.64
2020-02-20  19.2  991  0.75
2020-02-22  13.4  973  0.83

4.3.3 读取CSV文件

1
2
3
4
# 读取带有索引和列名的CSV文件
import pandas as pd
a = pd.read_csv('/home/mw/input/pythonbook9857/pandas_read.csv', index_col=0)
print(a)
        t      p    rh
sta                   
s1   21.7  983.0  0.64
s2   19.2  991.0  0.75
s3   13.4  973.0  0.83
NaN   NaN    NaN   NaN
NaN   NaN    NaN   NaN
NaN   NaN    NaN   NaN
NaN   NaN    NaN   NaN
NaN   NaN    NaN   NaN
NaN   NaN    NaN   NaN
NaN   NaN    NaN   NaN
NaN   NaN    NaN   NaN
NaN   NaN    NaN   NaN
NaN   NaN    NaN   NaN
NaN   NaN    NaN   NaN
NaN   NaN    NaN   NaN
1
2
3
4
# 以其他列为索引
import pandas as pd
a = pd.read_csv('/home/mw/input/pythonbook9857/pandas_read.csv', index_col=2)
print(a)
       sta     t    rh
p                     
983.0   s1  21.7  0.64
991.0   s2  19.2  0.75
973.0   s3  13.4  0.83
NaN    NaN   NaN   NaN
NaN    NaN   NaN   NaN
NaN    NaN   NaN   NaN
NaN    NaN   NaN   NaN
NaN    NaN   NaN   NaN
NaN    NaN   NaN   NaN
NaN    NaN   NaN   NaN
NaN    NaN   NaN   NaN
NaN    NaN   NaN   NaN
NaN    NaN   NaN   NaN
NaN    NaN   NaN   NaN
NaN    NaN   NaN   NaN
1
2
3
4
5
6
7
8
9
# 读取只带有索引或列名的CSV文件
import pandas as pd
a = pd.read_csv('/home/mw/input/pythonbook7300/pandas_read_noheader.csv', header=None, index_col=0)
print(a)
import pandas as pd
a = pd.read_csv('/home/mw/input/pythonbook7300/pandas_read_noheader.csv',
names=['sta', 't', 'p', 'rh'],
index_col=0)
print(a)
       1    2     3
0                  
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
        t    p    rh
sta                 
s1   21.7  983  0.64
s2   19.2  991  0.75
s3   13.4  973  0.83
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Read a CSV file that contains a time column; parse_dates=[0] asks pandas to
# parse column 0 as dates
import pandas as pd
a = pd.read_csv('/home/mw/input/pythonbook7300/pandas_read_date.csv', parse_dates=[0])
print(a)
print('----------')
print(a.dtypes)
# Use the parsed time column directly as the index
import pandas as pd
a = pd.read_csv('/home/mw/input/pythonbook7300/pandas_read_date.csv', parse_dates=[0], index_col=0)
print(a)
# When the timestamp is spread over several columns: the nested list combines
# columns 0 and 1 into one date column, which then becomes the index
# NOTE(review): combining columns via nested parse_dates is deprecated since
# pandas 2.2; the recommended replacement is pd.to_datetime on the loaded columns
import pandas as pd
a = pd.read_csv('/home/mw/input/pythonbook7300/pandas_read_datetime.csv',
parse_dates=[[0, 1]],
index_col=0)
print(a)
        time     t    p    rh
0 2020-02-25  21.7  983  0.64
1 2020-02-26  19.2  991  0.75
2 2020-02-27  13.4  973  0.83
----------
time    datetime64[ns]
t              float64
p                int64
rh             float64
dtype: object
               t    p    rh
time                       
2020-02-25  21.7  983  0.64
2020-02-26  19.2  991  0.75
2020-02-27  13.4  973  0.83
               t    p    rh
date_time                  
2020-02-25  21.7  983  0.64
2020-02-26  19.2  991  0.75
2020-02-27  13.4  973  0.83

4.3.4 pd.DataFrame的算术运算

1
2
3
4
5
6
7
8
9
# pd.DataFrame与标量的算术运算
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a+100)
        t     p      rh
s1  121.7  1083  100.64
s2  119.2  1091  100.75
s3  113.4  1073  100.83
1
2
3
4
5
6
7
8
9
10
# pd.DataFrame的赋值修改
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
a['rh'] = a['rh'] * 100
print(a)
       t    p    rh
s1  21.7  983  64.0
s2  19.2  991  75.0
s3  13.4  973  83.0
1
2
3
4
5
6
7
8
9
10
# pd.DataFrame与pd.Series的算术运算
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
b = pd.Series([10, -100, 0], index=['p', 't', 'rh'])
print(a + b)
       p    rh     t
s1   993  0.64 -78.3
s2  1001  0.75 -80.8
s3   983  0.83 -86.6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# pd.DataFrame与pd.DataFrame的算术运算
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
b = pd.DataFrame([[20, -14, -0.25],
[10, -5, -0.14],
[17, -3, -0.33]],
index=['s2', 's1', 's3'],
columns=['p', 't', 'rh']
)
print(a + b)
       p   rh     t
s1   993  0.5  16.7
s2  1011  0.5   5.2
s3   990  0.5  10.4
1
2
3
4
5
6
7
8
9
10
# 按数据条件提取满足条件的行
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
b = a[a['t']<20]
print(b)
       t    p    rh
s2  19.2  991  0.75
s3  13.4  973  0.83
1
2
3
4
5
6
7
8
9
10
11
12
# 按多个数据条件提取满足条件的行
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
b = a[(a['t']<20)&(a['rh']>0.8)]
print(b)
b = a[(a['t']<19)|(a['rh']<0.7)]
print(b)
       t    p    rh
s3  13.4  973  0.83
       t    p    rh
s1  21.7  983  0.64
s3  13.4  973  0.83
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# 按照时间索引条件提取
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=pd.to_datetime(['2020-02-19', '2020-02-20', '2020-02-22']),
columns=['t', 'p', 'rh']
)
b = a[a.index.day == 19]
print(b)
b = a[a.index.month == 2]
print(b)
b = a[a.index.year == 2020]
print(b)
               t    p    rh
2020-02-19  21.7  983  0.64
               t    p    rh
2020-02-19  21.7  983  0.64
2020-02-20  19.2  991  0.75
2020-02-22  13.4  973  0.83
               t    p    rh
2020-02-19  21.7  983  0.64
2020-02-20  19.2  991  0.75
2020-02-22  13.4  973  0.83

4.3.6 pd.DataFrame的常用属性

1
2
3
4
5
6
7
8
9
10
11
# 查看常用属性
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a.dtypes)
print(a.ndim)
print(a.shape)
t     float64
p       int64
rh    float64
dtype: object
2
(3, 3)
1
2
3
4
5
6
7
8
9
# 通过行、列标签访问单个元素
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a.at['s1', 't'])
21.7
1
2
3
4
5
6
7
8
9
# 通过行、列位置访问单个元素
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a.iat[0, 0])
21.7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# 通过行、列标签或布尔序列访问多个元素
# 按标签访问行
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a.loc['s1']) # Series
print('----------')
print(a.loc[['s1']]) # DataFrame
print('*-*-*-*-*-')
print(a.loc[['s1', 's3']]) # DataFrame
t      21.70
p     983.00
rh      0.64
Name: s1, dtype: float64
----------
       t    p    rh
s1  21.7  983  0.64
*-*-*-*-*-
       t    p    rh
s1  21.7  983  0.64
s3  13.4  973  0.83
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Access multiple elements by row/column labels or boolean sequences
# Access columns by label (the row selector ':' keeps every row)
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a.loc[:, 't']) # Series
print('----------')
print(a.loc[:, ['t']]) # DataFrame
print('*-*-*-*-*-')
print(a.loc[:, ['t', 'rh']]) # DataFrame
s1    21.7
s2    19.2
s3    13.4
Name: t, dtype: float64
----------
       t
s1  21.7
s2  19.2
s3  13.4
*-*-*-*-*-
       t    rh
s1  21.7  0.64
s2  19.2  0.75
s3  13.4  0.83
1
2
3
4
5
6
7
8
9
10
11
12
# 通过行、列标签或布尔序列访问多个元素
# 按布尔序列访问行、列
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a.loc[[True, False, True]]) # 行
print('----------')
print(a.loc[:, [True, False, True]]) # 列
       t    p    rh
s1  21.7  983  0.64
s3  13.4  973  0.83
----------
       t    rh
s1  21.7  0.64
s2  19.2  0.75
s3  13.4  0.83
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# 通过行、列位置访问多个元素
# 按位置访问行
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a.iloc[0]) # Series
print('----------')
print(a.iloc[[0]]) # DataFrame
print('*-*-*-*-*-')
print(a.iloc[[0, 2]]) # DataFrame
t      21.70
p     983.00
rh      0.64
Name: s1, dtype: float64
----------
       t    p    rh
s1  21.7  983  0.64
*-*-*-*-*-
       t    p    rh
s1  21.7  983  0.64
s3  13.4  973  0.83
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# 通过行、列位置访问多个元素
# 按位置访问列
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a.iloc[:, 0]) # Series
print('----------')
print(a.iloc[:, [0]]) # DataFrame
print('*-*-*-*-*-')
print(a.iloc[:, [0, 2]]) # DataFrame
s1    21.7
s2    19.2
s3    13.4
Name: t, dtype: float64
----------
       t
s1  21.7
s2  19.2
s3  13.4
*-*-*-*-*-
       t    rh
s1  21.7  0.64
s2  19.2  0.75
s3  13.4  0.83
1
2
3
4
5
6
7
8
9
10
11
# 获取数据原始的np.ndarray对象
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
b = a.values
print(b)
print(type(b))
[[2.17e+01 9.83e+02 6.40e-01]
 [1.92e+01 9.91e+02 7.50e-01]
 [1.34e+01 9.73e+02 8.30e-01]]
<class 'numpy.ndarray'>

4.3.7 pd.DataFrame的常用方法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# 删除包含NaN的行或列
import numpy as np
import pandas as pd
a = pd.DataFrame([[np.nan, 983, 0.64],
[np.nan, np.nan, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a)
print('------')
b = a.dropna() # 删除包含NaN的行
print(b)
print('-*-*-*-*-')
c = a.dropna(axis=1) # 删除包含NaN的列
print(c)
       t      p    rh
s1   NaN  983.0  0.64
s2   NaN    NaN  0.75
s3  13.4  973.0  0.83
------
       t      p    rh
s3  13.4  973.0  0.83
-*-*-*-*-
      rh
s1  0.64
s2  0.75
s3  0.83
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# 仅删除全为NaN的行或列
import numpy as np
import pandas as pd
a = pd.DataFrame([[np.nan, 983, 0.64],
[np.nan, np.nan, np.nan],
[np.nan, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a)
print('------')
b = a.dropna(how='all') # 删除全为NaN的行
print(b)
print('-*-*-*-*-')
c = a.dropna(axis=1, how='all') # 删除全为NaN的列
print(c)
     t      p    rh
s1 NaN  983.0  0.64
s2 NaN    NaN   NaN
s3 NaN  973.0  0.83
------
     t      p    rh
s1 NaN  983.0  0.64
s3 NaN  973.0  0.83
-*-*-*-*-
        p    rh
s1  983.0  0.64
s2    NaN   NaN
s3  973.0  0.83
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Group rows by their index label (level=0) and aggregate each group
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64, 'a'],
                  [19.2, 991, 0.75, 'b'],
                  [13.4, 973, 0.83, 'a'],
                  [13.4, 973, 0.83, 'b']],
                 index=['s1', 's2', 's3', 's2'],
                 columns=['t', 'p', 'rh', 'kind'])
print(a)
print('--------')
# numeric_only=True keeps the text column 'kind' out of the sum. Without it,
# pandas 2.x concatenates the strings instead of silently dropping the column.
b = a.groupby(level=0).sum(numeric_only=True)
print(b)
print('-*-*-*-*-*')
c = a.groupby(level=0).size()  # number of rows per index label
print(c)
       t    p    rh kind
s1  21.7  983  0.64    a
s2  19.2  991  0.75    b
s3  13.4  973  0.83    a
s2  13.4  973  0.83    b
--------
       t     p    rh
s1  21.7   983  0.64
s2  32.6  1964  1.58
s3  13.4   973  0.83
-*-*-*-*-*
s1    1
s2    2
s3    1
dtype: int64
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# 指定列名通过数据分组
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64, 'a'],
[19.2, 991, 0.75, 'b'],
[13.4, 973, 0.83, 'a'],
[13.4, 973, 0.83, 'b']],
index=['s1', 's2', 's3', 's2'],
columns=['t', 'p', 'rh', 'kind']
)
print(a)
print('--------')
print(a.groupby(by='kind').sum())
print('-*-*-*-*-*')
print(a.groupby(by='kind').size())
       t    p    rh kind
s1  21.7  983  0.64    a
s2  19.2  991  0.75    b
s3  13.4  973  0.83    a
s2  13.4  973  0.83    b
--------
         t     p    rh
kind                  
a     35.1  1956  1.47
b     32.6  1964  1.58
-*-*-*-*-*
kind
a    2
b    2
dtype: int64
1
2
3
4
5
6
7
8
9
10
11
12
# Group rows with a function that is applied to each index label
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64, 'a'],
                  [19.2, 991, 0.75, 'b'],
                  [13.4, 973, 0.83, 'a'],
                  [13.4, 973, 0.83, 'b']],
                 index=['s1', 's2', 's3', 's2'],
                 columns=['t', 'p', 'rh', 'kind'])
print(a)
print('--------')
# The key function yields True for label 's2' and False otherwise.
# numeric_only=True keeps the text column 'kind' out of the sum. Without it,
# pandas 2.x concatenates the strings instead of dropping the column.
b = a.groupby(by=lambda x: x == 's2').sum(numeric_only=True)
print(b)
       t    p    rh kind
s1  21.7  983  0.64    a
s2  19.2  991  0.75    b
s3  13.4  973  0.83    a
s2  13.4  973  0.83    b
--------
          t     p    rh
False  35.1  1956  1.47
True   32.6  1964  1.58
1
2
3
4
5
6
7
8
9
10
11
# 对时间戳类型的索引进行分组
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=pd.to_datetime(['2020-02-19', '2020-02-20', '2020-03-22']),
columns=['t', 'p', 'rh']
)
print(a)
print('--------')
print(a.groupby(by=lambda x:x.month).sum())
               t    p    rh
2020-02-19  21.7  983  0.64
2020-02-20  19.2  991  0.75
2020-03-22  13.4  973  0.83
--------
      t     p    rh
2  40.9  1974  1.39
3  13.4   973  0.83
1
2
3
4
5
6
7
8
9
10
11
# Map every element through a function (element-wise mapping, not grouping)
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; kept here because DataFrame.map does not exist in older pandas
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a)
print('--------')
print(a.applymap(lambda x:x/10))
       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
--------
       t     p     rh
s1  2.17  98.3  0.064
s2  1.92  99.1  0.075
s3  1.34  97.3  0.083
1
2
3
4
5
6
7
8
9
10
11
# 重采样
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=pd.to_datetime(['2020-02-19', '2020-02-20', '2020-02-22']),
columns=['t', 'p', 'rh']
)
print(a)
print('--------')
print(a.resample('2D').mean())
               t    p    rh
2020-02-19  21.7  983  0.64
2020-02-20  19.2  991  0.75
2020-02-22  13.4  973  0.83
--------
                t      p     rh
2020-02-19  20.45  987.0  0.695
2020-02-21  13.40  973.0  0.830
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# 填充NaN
# 通过插值的方式填充
import numpy as np
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, np.nan, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a)
print('--------')
print(a.interpolate(method='linear')) # 列向线性插值
print('*-*-*-*-*-')
print(a.interpolate(method='linear', axis=1)) # 行向线性插值
       t      p    rh
s1  21.7  983.0  0.64
s2  19.2    NaN  0.75
s3  13.4  973.0  0.83
--------
       t      p    rh
s1  21.7  983.0  0.64
s2  19.2  978.0  0.75
s3  13.4  973.0  0.83
*-*-*-*-*-
       t        p    rh
s1  21.7  983.000  0.64
s2  19.2    9.975  0.75
s3  13.4  973.000  0.83
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Fill NaN with a fixed value, or from the neighbouring values along each column
import numpy as np
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
                  [19.2, np.nan, 0.75],
                  [13.4, 973, 0.83]],
                 index=['s1', 's2', 's3'],
                 columns=['t', 'p', 'rh'])
print(a)
print('--------')
print(a.fillna(9999))  # fill with a constant
print('*-*-*-*-*-')
# fillna(method=...) is deprecated since pandas 2.1; call bfill()/ffill() directly.
print(a.bfill())  # column-wise backward fill (value from the row below)
print('**********')
print(a.ffill())  # column-wise forward fill (value from the row above)
       t      p    rh
s1  21.7  983.0  0.64
s2  19.2    NaN  0.75
s3  13.4  973.0  0.83
--------
       t       p    rh
s1  21.7   983.0  0.64
s2  19.2  9999.0  0.75
s3  13.4   973.0  0.83
*-*-*-*-*-
       t      p    rh
s1  21.7  983.0  0.64
s2  19.2  973.0  0.75
s3  13.4  973.0  0.83
**********
       t      p    rh
s1  21.7  983.0  0.64
s2  19.2  983.0  0.75
s3  13.4  973.0  0.83
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Fill NaN from the neighbouring values within the same row (axis=1)
import numpy as np
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
                  [19.2, np.nan, 0.75],
                  [13.4, 973, 0.83]],
                 index=['s1', 's2', 's3'],
                 columns=['t', 'p', 'rh'])
print(a)
print('--------')
# fillna(method=...) is deprecated since pandas 2.1; call bfill()/ffill() directly.
print(a.bfill(axis=1))  # row-wise backward fill (value from the column to the right)
print('*-*-*-*-*-')
print(a.ffill(axis=1))  # row-wise forward fill (value from the column to the left)
       t      p    rh
s1  21.7  983.0  0.64
s2  19.2    NaN  0.75
s3  13.4  973.0  0.83
--------
       t       p    rh
s1  21.7  983.00  0.64
s2  19.2    0.75  0.75
s3  13.4  973.00  0.83
*-*-*-*-*-
       t      p    rh
s1  21.7  983.0  0.64
s2  19.2   19.2  0.75
s3  13.4  973.0  0.83
1
2
3
4
5
6
7
8
9
10
11
12
13
# 按照指定序列/索引和列名排序
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a)
print('--------')
print(a.reindex(['s2', 's3', 's1']))
print('*-*-*-*-*-')
print(a.reindex(index=['s2', 's3', 's1'], columns=['rh', 't', 'p']))
       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
--------
       t    p    rh
s2  19.2  991  0.75
s3  13.4  973  0.83
s1  21.7  983  0.64
*-*-*-*-*-
      rh     t    p
s2  0.75  19.2  991
s3  0.83  13.4  973
s1  0.64  21.7  983
1
2
3
4
5
6
7
8
9
10
11
12
# 按照已有DataFrame对象的索引和列名排序
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
b = pd.DataFrame(data=None, index=['s2', 's1', 's3'], columns=['p', 't', 'rh'])
print(a)
print('--------')
print(a.reindex_like(b))
       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
--------
      p     t    rh
s2  991  19.2  0.75
s1  983  21.7  0.64
s3  973  13.4  0.83
1
2
3
4
5
6
7
8
9
10
11
12
# 重置索引
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83],
[12.4, 963, 0.73]],
index=['s1', 's2', 's3', 's4'],
columns=['t', 'p', 'rh']
)
print(a)
print('--------')
print(a.reset_index())
       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
s4  12.4  963  0.73
--------
  index     t    p    rh
0    s1  21.7  983  0.64
1    s2  19.2  991  0.75
2    s3  13.4  973  0.83
3    s4  12.4  963  0.73
1
2
3
4
5
6
7
8
9
10
11
12
13
# 滑动窗口计算
# 对普通序列进行滑动计算
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83],
[12.4, 963, 0.73]],
index=['s1', 's2', 's3', 's4'],
columns=['t', 'p', 'rh']
)
print(a)
print('--------')
print(a.rolling(2).mean()) #此处滑动窗口值为2
       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
s4  12.4  963  0.73
--------
        t      p     rh
s1    NaN    NaN    NaN
s2  20.45  987.0  0.695
s3  16.30  982.0  0.790
s4  12.90  968.0  0.780
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# 滑动窗口计算
# 对时间序列进行滑动计算
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83],
[12.4, 963, 0.73]],
index=pd.to_datetime(['2020-02-19', '2020-02-20',
'2020-02-21', '2020-02-22']),
columns=['t', 'p', 'rh']
)
print(a)
print('--------')
print(a.rolling('2D').mean())
               t    p    rh
2020-02-19  21.7  983  0.64
2020-02-20  19.2  991  0.75
2020-02-21  13.4  973  0.83
2020-02-22  12.4  963  0.73
--------
                t      p     rh
2020-02-19  21.70  983.0  0.640
2020-02-20  20.45  987.0  0.695
2020-02-21  16.30  982.0  0.790
2020-02-22  12.90  968.0  0.780
1
2
3
4
5
6
7
8
9
10
11
12
13
# Shift data
import pandas as pd

values = [
    [21.7, 983, 0.64],
    [19.2, 991, 0.75],
    [13.4, 973, 0.83],
]
a = pd.DataFrame(values, index=['s1', 's2', 's3'], columns=['t', 'p', 'rh'])
print(a)
print('--------')
# Default axis: every column's values move down by one row.
shifted_down = a.shift(1)
print(shifted_down)
print('*-*-*-*-*-')
# axis=1: every row's values move one column to the right.
shifted_right = a.shift(1, axis=1)
print(shifted_right)
       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
--------
       t      p    rh
s1   NaN    NaN   NaN
s2  21.7  983.0  0.64
s3  19.2  991.0  0.75
*-*-*-*-*-
     t     p   rh
s1 NaN  21.7  983
s2 NaN  19.2  991
s3 NaN  13.4  973
1
2
3
4
5
6
7
8
9
10
11
12
13
# Sort by index label
import pandas as pd

unsorted_labels = ['s2', 's1', 's3']
a = pd.DataFrame(
    data=[[21.7, 983, 0.64], [19.2, 991, 0.75], [13.4, 973, 0.83]],
    index=unsorted_labels,
    columns=['t', 'p', 'rh'],
)
print(a)
print('--------')
ascending_view = a.sort_index()  # ascending order
print(ascending_view)
print('*-*-*-*-*-')
descending_view = a.sort_index(ascending=False)  # descending order
print(descending_view)
       t    p    rh
s2  21.7  983  0.64
s1  19.2  991  0.75
s3  13.4  973  0.83
--------
       t    p    rh
s1  19.2  991  0.75
s2  21.7  983  0.64
s3  13.4  973  0.83
*-*-*-*-*-
       t    p    rh
s3  13.4  973  0.83
s2  21.7  983  0.64
s1  19.2  991  0.75
1
2
3
4
5
6
7
8
9
10
11
12
13
# Sort by data values
import pandas as pd

records = [
    [21.7, 983, 0.64],
    [19.2, 991, 0.75],
    [13.4, 973, 0.83],
]
a = pd.DataFrame(records, index=['s1', 's2', 's3'], columns=['t', 'p', 'rh'])
print(a)
print('--------')
by_p_ascending = a.sort_values('p')  # ascending by column p
print(by_p_ascending)
print('*-*-*-*-*-')
by_p_descending = a.sort_values('p', ascending=False)  # descending by column p
print(by_p_descending)
       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
--------
       t    p    rh
s3  13.4  973  0.83
s1  21.7  983  0.64
s2  19.2  991  0.75
*-*-*-*-*-
       t    p    rh
s2  19.2  991  0.75
s1  21.7  983  0.64
s3  13.4  973  0.83
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Get the maximum / minimum
# max() / min() return the extreme value of every column.
import pandas as pd

a = pd.DataFrame(
    [[21.7, 983, 0.64],
     [19.2, 991, 0.75],
     [13.4, 973, 0.83]],
    index=['s1', 's2', 's3'],
    columns=['t', 'p', 'rh'],
)
column_max = a.max()  # per-column maximum
column_min = a.min()  # per-column minimum
for shown in (a, '--------', column_max, '*-*-*-*-*-', column_min):
    print(shown)
       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
--------
t      21.70
p     991.00
rh      0.83
dtype: float64
*-*-*-*-*-
t      13.40
p     973.00
rh      0.64
dtype: float64
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Get the maximum / minimum
# max() / min() can also return the extreme value of every row.
import pandas as pd

measurements = [
    [21.7, 983, 0.64],
    [19.2, 991, 0.75],
    [13.4, 973, 0.83],
]
a = pd.DataFrame(measurements, index=['s1', 's2', 's3'], columns=['t', 'p', 'rh'])
print(a)
print('--------')
row_max = a.max(axis=1)  # per-row maximum
print(row_max)
print('*-*-*-*-*-')
row_min = a.min(axis=1)  # per-row minimum
print(row_min)
       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
--------
s1    983.0
s2    991.0
s3    973.0
dtype: float64
*-*-*-*-*-
s1    0.64
s2    0.75
s3    0.83
dtype: float64
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Get the label at which the maximum / minimum occurs
import pandas as pd

a = pd.DataFrame(
    data=[[21.7, 983, 0.64],
          [19.2, 991, 0.75],
          [13.4, 973, 0.83]],
    index=['s1', 's2', 's3'],
    columns=['t', 'p', 'rh'],
)
print(a)
print('--------')
row_label_of_min = a.idxmin()  # per column: row label holding the minimum
print(row_label_of_min)
print('*-*-*-*-*-')
row_label_of_max = a.idxmax()  # per column: row label holding the maximum
print(row_label_of_max)
print('~~~~~~')
column_label_of_max = a.idxmax(axis=1)  # per row: column label holding the maximum
print(column_label_of_max)
       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
--------
t     s3
p     s3
rh    s1
dtype: object
*-*-*-*-*-
t     s1
p     s2
rh    s3
dtype: object
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
~~~~~~
s1    p
s2    p
s3    p
dtype: object



```python
# Compute the standard deviation / unbiased variance
import pandas as pd

samples = [
    [21.7, 983, 0.64],
    [19.2, 991, 0.75],
    [13.4, 973, 0.83],
]
a = pd.DataFrame(samples, index=['s1', 's2', 's3'], columns=['t', 'p', 'rh'])
print(a)
print('--------')
column_std = a.std()  # standard deviation of each column
print(column_std)
print('*-*-*-*-*-')
column_var = a.var()  # unbiased variance of each column
print(column_var)
print('~~~~~~')
row_std = a.std(axis=1)  # standard deviation of each row
print(row_std)
```

       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
--------
t     4.257934
p     9.018500
rh    0.095394
dtype: float64
*-*-*-*-*-
t     18.130000
p     81.333333
rh     0.009100
dtype: float64
~~~~~~
s1    561.185113
s2    566.470168
s3    557.689381
dtype: float64
1
2
3
4
5
6
7
8
9
10
11
# Compute correlation coefficients
import pandas as pd

a = pd.DataFrame(
    [[21.7, 983, 0.64],
     [19.2, 991, 0.75],
     [13.4, 973, 0.83]],
    index=['s1', 's2', 's3'],
    columns=['t', 'p', 'rh'],
)
# Pairwise correlation between the columns of the frame.
correlation_matrix = a.corr()
for shown in (a, '--------', correlation_matrix):
    print(shown)
       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
--------
           t         p        rh
t   1.000000  0.726559 -0.950315
p   0.726559  1.000000 -0.476572
rh -0.950315 -0.476572  1.000000
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Use corrwith() to compute the correlation between a pd.DataFrame and
# another pd.Series or pd.DataFrame.
# The import is included so this snippet runs on its own, like every
# other snippet in this chapter.
import pandas as pd
a = pd.DataFrame([[21.7, 983, 0.64],
[19.2, 991, 0.75],
[13.4, 973, 0.83]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
b = pd.DataFrame([[21.5, 988, 0.62],
[19.4, 996, 0.74],
[13.2, 973, 0.85]],
index=['s1', 's2', 's3'],
columns=['t', 'p', 'rh']
)
print(a)
print('--------')
# One coefficient per column label shared by a and b.
print(a.corrwith(b))
       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
--------
t     0.998639
p     0.993970
rh    0.997835
dtype: float64
1
2
3
4
5
6
7
8
9
10
11
# Compute the covariance
import pandas as pd

a = pd.DataFrame(
    {
        't': [21.7, 19.2, 13.4],
        'p': [983, 991, 973],
        'rh': [0.64, 0.75, 0.83],
    },
    index=['s1', 's2', 's3'],
)
print(a)
print('--------')
# Pairwise covariance between the columns of the frame.
covariance_matrix = a.cov()
print(covariance_matrix)
       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
--------
         t          p      rh
t   18.130  27.900000 -0.3860
p   27.900  81.333333 -0.4100
rh  -0.386  -0.410000  0.0091
1
2
3
4
5
6
7
8
9
10
11
12
13
# Sum / mean of each series
import pandas as pd

columns_by_name = {
    't': [21.7, 19.2, 13.4],
    'p': [983, 991, 973],
    'rh': [0.64, 0.75, 0.83],
}
a = pd.DataFrame(columns_by_name, index=['s1', 's2', 's3'])
print(a)
print('--------')
column_totals = a.sum()  # per-column sum
print(column_totals)
print('-*-*-*-*-')
column_means = a.mean()  # per-column mean
print(column_means)
       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
--------
t       54.30
p     2947.00
rh       2.22
dtype: float64
-*-*-*-*-
t      18.100000
p     982.333333
rh      0.740000
dtype: float64
1
2
3
4
5
6
7
8
9
10
11
# Get absolute values
import pandas as pd

signed_rows = [
    [-21.7, 983, 0.64],
    [19.2, 991, 0.75],
    [13.4, -973, -0.83],
]
a = pd.DataFrame(signed_rows, index=['s1', 's2', 's3'], columns=['t', 'p', 'rh'])
# abs() returns a new frame with every element's absolute value.
magnitudes = a.abs()
for shown in (a, '--------', magnitudes):
    print(shown)
       t    p    rh
s1 -21.7  983  0.64
s2  19.2  991  0.75
s3  13.4 -973 -0.83
--------
       t    p    rh
s1  21.7  983  0.64
s2  19.2  991  0.75
s3  13.4  973  0.83
1
2
3
4
5
6
7
8
9
10
# Save as a CSV file
import pandas as pd

a = pd.DataFrame(
    data=[[-21.7, 983, 0.64],
          [19.2, 991, 0.75],
          [13.4, -973, -0.83]],
    index=['s1', 's2', 's3'],
    columns=['t', 'p', 'rh'],
)
print(a)
# Writes the frame to dataframe_to.csv in the current working directory.
a.to_csv('dataframe_to.csv')
       t    p    rh
s1 -21.7  983  0.64
s2  19.2  991  0.75
s3  13.4 -973 -0.83
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Convert data types
# Convert every column to the same dtype.
import numpy as np
import pandas as pd

a = pd.DataFrame(
    [[21.7, 983, 0.64],
     [19.2, 991, 0.75],
     [13.4, 973, 0.83]],
    index=['s1', 's2', 's3'],
    columns=['t', 'p', 'rh'],
)
original_dtypes = a.dtypes
print(original_dtypes)
print('--------')
# astype() returns a converted copy; a itself keeps its dtypes.
b = a.astype(np.float32)
converted_dtypes = b.dtypes
print(converted_dtypes)
t     float64
p       int64
rh    float64
dtype: object
--------
t     float32
p     float32
rh    float32
dtype: object
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Convert data types
# Convert only selected columns.
import numpy as np
import pandas as pd

a = pd.DataFrame(
    [[21.7, 983, 0.64],
     [19.2, 991, 0.75],
     [13.4, 973, 0.83]],
    index=['s1', 's2', 's3'],
    columns=['t', 'p', 'rh'],
)
print(a.dtypes)
print('--------')
# Map column name -> target dtype; columns not listed keep their dtype.
target_dtypes = {'t': np.float32, 'p': np.float64}
b = a.astype(target_dtypes)
print(b.dtypes)
t     float64
p       int64
rh    float64
dtype: object
--------
t     float32
p     float64
rh    float64
dtype: object

4.4 pandas的常用函数

4.4.1 to_numeric()——将序列转换为数值类型

1
2
3
4
5
6
7
8
9
10
11
12
# Convert a non-numeric series to a numeric type
import pandas as pd

mixed_rows = [
    [21.7, 983, 0.64],
    ['19.2', 991, 0.75],  # the string makes column t non-numeric
    [13.4, 973, 0.83],
]
a = pd.DataFrame(mixed_rows, index=['s1', 's2', 's3'], columns=['t', 'p', 'rh'])
print(a.dtypes)
print('--------')
# Parse column t into numbers and write it back into the frame.
numeric_t = pd.to_numeric(a['t'])
a['t'] = numeric_t
print(a.dtypes)
t      object
p       int64
rh    float64
dtype: object
--------
t     float64
p       int64
rh    float64
dtype: object

4.4.2 to_datetime()——将序列转换为时间戳类型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Parse time strings in standard formats directly (no format argument)
import pandas as pd

# Date only: compact, dash-separated and slash-separated.
a = pd.to_datetime(['20200228', '20200301'])
b = pd.to_datetime(['2020-02-28', '2020-03-01'])
c = pd.to_datetime(['2020/02/28', '2020/03/01'])
# Date plus hour and minute.
d = pd.to_datetime(['202002281200', '202002281200'])
e = pd.to_datetime(['2020-02-28 12:00', '2020-03-01 12:00'])
f = pd.to_datetime(['2020/02/28 12:00', '2020/03/01 12:00'])
# Date plus hour, minute and second.
g = pd.to_datetime(['20200228120050', '20200228120050'])
h = pd.to_datetime(['2020-02-28 12:00:50', '2020-03-01 12:00:50'])
i = pd.to_datetime(['2020/02/28 12:00:50', '2020/03/01 12:00:50'])
for parsed in (a, b, c, d, e, f, g, h, i):
    print(parsed)
DatetimeIndex(['2020-02-28', '2020-03-01'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2020-02-28', '2020-03-01'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2020-02-28', '2020-03-01'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2020-02-28 12:00:00', '2020-02-28 12:00:00'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2020-02-28 12:00:00', '2020-03-01 12:00:00'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2020-02-28 12:00:00', '2020-03-01 12:00:00'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2020-02-28 12:00:50', '2020-02-28 12:00:50'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2020-02-28 12:00:50', '2020-03-01 12:00:50'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2020-02-28 12:00:50', '2020-03-01 12:00:50'], dtype='datetime64[ns]', freq=None)
1
2
3
4
5
6
7
8
# Parse time strings in non-standard formats by passing an explicit format
import pandas as pd

date_with_cjk_units = ['2020年02月28日', '2020年03月28日']
day_in_parentheses = ['202002(28)', '202003(28)']
time_in_parentheses = ['20200228(1200)', '20200328(1200)']

a = pd.to_datetime(date_with_cjk_units, format='%Y年%m月%d日')
b = pd.to_datetime(day_in_parentheses, format='%Y%m(%d)')
c = pd.to_datetime(time_in_parentheses, format='%Y%m%d(%H%M)')
for parsed in (a, b, c):
    print(parsed)
DatetimeIndex(['2020-02-28', '2020-03-28'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2020-02-28', '2020-03-28'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2020-02-28 12:00:00', '2020-03-28 12:00:00'], dtype='datetime64[ns]', freq=None)

4.4.3 to_timedelta()——将序列转换为时间差类型

1
2
3
4
5
6
# The concept of a time difference (timedelta)
import pandas as pd
# Strings carry their own unit. The seconds unit is written in lower case
# ('20s') because the upper-case 'S' spelling is deprecated in recent
# pandas releases; both spellings parse to the same 20-second value.
a = pd.to_timedelta(['1 day', '10 min', '20s'])
# Plain numbers are interpreted using the unit argument (days here).
b = pd.to_timedelta([1, 3, 5], unit='D')
print(a)
print(b)
TimedeltaIndex(['1 days 00:00:00', '0 days 00:10:00', '0 days 00:00:20'], dtype='timedelta64[ns]', freq=None)
TimedeltaIndex(['1 days', '3 days', '5 days'], dtype='timedelta64[ns]', freq=None)

4.4.4 date_range()——生成时间序列

1
2
3
4
5
6
7
8
9
10
11
# Generate time series
import pandas as pd

# Three timestamps, three hours apart. The hour alias is written in lower
# case ('3h') because the upper-case 'H' alias is deprecated in recent
# pandas releases. The freq shown in the printed repr may be '3H' or '3h'
# depending on the installed pandas version.
a = pd.date_range(start='2020-03-01', periods=3, freq='3h')
# Daily timestamps from start to end, both endpoints included.
b = pd.date_range(start='2020-03-01', end='2020-03-03', freq='D')
# Same range without freq: the printed output also reports freq='D'.
c = pd.date_range(start='2020-03-01', end='2020-03-03')

print(a)
print(b)
print(c)

DatetimeIndex(['2020-03-01 00:00:00', '2020-03-01 03:00:00',
               '2020-03-01 06:00:00'],
              dtype='datetime64[ns]', freq='3H')
DatetimeIndex(['2020-03-01', '2020-03-02', '2020-03-03'], dtype='datetime64[ns]', freq='D')
DatetimeIndex(['2020-03-01', '2020-03-02', '2020-03-03'], dtype='datetime64[ns]', freq='D')

4.4.5 merge()——按值连接两个pd.DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# Join two pd.DataFrame objects on matching values
import pandas as pd

weather_obs = [
    ['sunny', 983, 0.64],
    ['rain', 991, 0.75],
    ['fog', 973, 0.83],
    ['haze', 1001, 0.93],
]
weather_codes = [
    ['rain', '0121'],
    ['windy', '1123'],
    ['fog', '1234'],
    ['sunny', '2234'],
]
a = pd.DataFrame(weather_obs,
                 index=['d1', 'd2', 'd3', 'd4'],
                 columns=['weather', 'p', 'rh'])
b = pd.DataFrame(weather_codes, columns=['weather', 'code'])
print(a)
print('------')
print(b)
# Join on the 'weather' column with each join type in turn:
#   left  - left join
#   right - right join
#   inner - inner join
#   outer - outer join
for how in ('left', 'right', 'inner', 'outer'):
    c = pd.merge(left=a, right=b, left_on='weather', right_on='weather', how=how)
    print(c)
   weather     p    rh
d1   sunny   983  0.64
d2    rain   991  0.75
d3     fog   973  0.83
d4    haze  1001  0.93
------
  weather  code
0    rain  0121
1   windy  1123
2     fog  1234
3   sunny  2234
  weather     p    rh  code
0   sunny   983  0.64  2234
1    rain   991  0.75  0121
2     fog   973  0.83  1234
3    haze  1001  0.93   NaN
  weather      p    rh  code
0    rain  991.0  0.75  0121
1   windy    NaN   NaN  1123
2     fog  973.0  0.83  1234
3   sunny  983.0  0.64  2234
  weather    p    rh  code
0   sunny  983  0.64  2234
1    rain  991  0.75  0121
2     fog  973  0.83  1234
  weather       p    rh  code
0   sunny   983.0  0.64  2234
1    rain   991.0  0.75  0121
2     fog   973.0  0.83  1234
3    haze  1001.0  0.93   NaN
4   windy     NaN   NaN  1123

4.4.6 concat()——合并多个pd.DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
# Combine several pd.DataFrame objects directly along rows or columns
import pandas as pd

a = pd.DataFrame(
    data=[['sunny', 983, 0.64],
          ['rain', 991, 0.75],
          ['fog', 973, 0.83],
          ['haze', 1001, 0.93]],
    index=['d1', 'd2', 'd3', 'd4'],
    columns=['weather', 'p', 'rh'],
)
frames = [a, a]
stacked_rows = pd.concat(frames)  # default axis: append rows
side_by_side = pd.concat(frames, axis=1)  # axis=1: append columns
for shown in (stacked_rows, '-----', side_by_side):
    print(shown)
   weather     p    rh
d1   sunny   983  0.64
d2    rain   991  0.75
d3     fog   973  0.83
d4    haze  1001  0.93
d1   sunny   983  0.64
d2    rain   991  0.75
d3     fog   973  0.83
d4    haze  1001  0.93
-----
   weather     p    rh weather     p    rh
d1   sunny   983  0.64   sunny   983  0.64
d2    rain   991  0.75    rain   991  0.75
d3     fog   973  0.83     fog   973  0.83
d4    haze  1001  0.93    haze  1001  0.93