Pandas

2024-01-03 19:01:06

文章目录

pandas基本介绍
pandas 选择数据
pandas设置值
pandas处理丢失数据
pandas导入导出数据
pandas合并DataFrame
pandas:数据可视化

pandas基本介绍

import pandas as pd
import numpy as np

s = pd.Series([1,3,6,np.nan,44,1]) #numpy如果是针对列表的话，pandas更像是字典
s # 在这可以理解为一维pandas

0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64

dates = pd.date_range('20160101',periods=6)  # 定义行索引
dates

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')

df = pd.DataFrame(np.random.randn(6,4),index=dates)
df

	0	1	2	3
2016-01-01	0.310279	-0.113450	1.453515	0.893409
2016-01-02	0.511068	-0.088535	-1.751460	0.390180
2016-01-03	0.415210	0.352752	0.431860	0.225930
2016-01-04	0.649793	0.743668	1.250057	1.396353
2016-01-05	1.145737	0.338144	1.077738	0.856458
2016-01-06	0.037643	1.375382	1.560754	-0.435449

df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d']) # 常规形式创建DataFrame
df

	a	b	c	d
2016-01-01	1.117627	-0.796587	0.041202	-0.772693
2016-01-02	-0.987977	-1.525442	-0.684378	0.007355
2016-01-03	-0.255173	-1.444724	0.599456	1.050332
2016-01-04	-0.020769	-0.354652	-1.111232	1.217364
2016-01-05	-1.114441	-0.069303	0.473385	0.425665
2016-01-06	1.157257	-0.081045	0.973594	1.198853

df1 = pd.DataFrame(np.arange(12).reshape(3,4))
df1

	0	1	2	3
0	0	1	2	3
1	4	5	6	7
2	8	9	10	11

df2 = pd.DataFrame({'A':1.,
                    'B':pd.Timestamp('20230102'),
                    'C':pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D':np.array([3]*4,dtype='int32'),
                    'E':pd.Categorical(['test','train','test','train']),
                    'F':'foo'})  # 字典方式创建DataFrame
df2

	A	B	C	D	E	F
0	1.0	2023-01-02	1.0	3	test	foo
1	1.0	2023-01-02	1.0	3	train	foo
2	1.0	2023-01-02	1.0	3	test	foo
3	1.0	2023-01-02	1.0	3	train	foo

df2.dtypes # 每列的数据类型

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

df2.index # 行索引

Int64Index([0, 1, 2, 3], dtype='int64')

df2.columns # 列索引

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

df2.values  # 输出值

array([[1.0, Timestamp('2023-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2023-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2023-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2023-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

df2.describe()  # 描述dataframe

	A	C	D
count	4.0	4.0	4.0
mean	1.0	1.0	3.0
std	0.0	0.0	0.0
min	1.0	1.0	3.0
25%	1.0	1.0	3.0
50%	1.0	1.0	3.0
75%	1.0	1.0	3.0
max	1.0	1.0	3.0

df2.T #转置

	0	1	2	3
A	1.0	1.0	1.0	1.0
B	2023-01-02 00:00:00	2023-01-02 00:00:00	2023-01-02 00:00:00	2023-01-02 00:00:00
C	1.0	1.0	1.0	1.0
D	3	3	3	3
E	test	train	test	train
F	foo	foo	foo	foo

df2.sort_index(axis=1,ascending=False)  #对列进行倒序排序

	F	E	D	C	B	A
0	foo	test	3	1.0	2023-01-02	1.0
1	foo	train	3	1.0	2023-01-02	1.0
2	foo	test	3	1.0	2023-01-02	1.0
3	foo	train	3	1.0	2023-01-02	1.0

df2.sort_index(axis=0,ascending=False)  #对行进行倒序排序

	A	B	C	D	E	F
3	1.0	2023-01-02	1.0	3	train	foo
2	1.0	2023-01-02	1.0	3	test	foo
1	1.0	2023-01-02	1.0	3	train	foo
0	1.0	2023-01-02	1.0	3	test	foo

df2.sort_values(by='E')  # 对值进行排列

	A	B	C	D	E	F
0	1.0	2023-01-02	1.0	3	test	foo
2	1.0	2023-01-02	1.0	3	test	foo
1	1.0	2023-01-02	1.0	3	train	foo
3	1.0	2023-01-02	1.0	3	train	foo

pandas 选择数据

import pandas as pd
import numpy as np

dates = pd.date_range('20240101',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
df

	A	B	C	D
2024-01-01	0	1	2	3
2024-01-02	4	5	6	7
2024-01-03	8	9	10	11
2024-01-04	12	13	14	15
2024-01-05	16	17	18	19
2024-01-06	20	21	22	23

df['A'] #选取A这一列（返回Series）

2024-01-01     0
2024-01-02     4
2024-01-03     8
2024-01-04    12
2024-01-05    16
2024-01-06    20
Freq: D, Name: A, dtype: int32

df.A  #以属性方式选取A列，与df['A']等价

2024-01-01     0
2024-01-02     4
2024-01-03     8
2024-01-04    12
2024-01-05    16
2024-01-06    20
Freq: D, Name: A, dtype: int32

df[0:3]  #选取前三行

	A	B	C	D
2024-01-01	0	1	2	3
2024-01-02	4	5	6	7
2024-01-03	8	9	10	11

df['20240102':'20240105']

	A	B	C	D
2024-01-02	4	5	6	7
2024-01-03	8	9	10	11
2024-01-04	12	13	14	15
2024-01-05	16	17	18	19

select by label:loc 纯标签筛选

df.loc['20240103']  #以标签名义来选择，更具体一点

A     8
B     9
C    10
D    11
Name: 2024-01-03 00:00:00, dtype: int32

df.loc[:,['A','B']]  #保存所有行，选择A、B两列

	A	B
2024-01-01	0	1
2024-01-02	4	5
2024-01-03	8	9
2024-01-04	12	13
2024-01-05	16	17
2024-01-06	20	21

df.loc['20240102',['A','B']]

A    4
B    5
Name: 2024-01-02 00:00:00, dtype: int32

select by position:iloc 纯位置筛选

df

	A	B	C	D
2024-01-01	0	1	2	3
2024-01-02	4	5	6	7
2024-01-03	8	9	10	11
2024-01-04	12	13	14	15
2024-01-05	16	17	18	19
2024-01-06	20	21	22	23

df.iloc[3] # 选择第4行数据（位置索引从0开始计数）

A    12
B    13
C    14
D    15
Name: 2024-01-04 00:00:00, dtype: int32

df.iloc[3,1]  # 选择第4行、第2列的数据（位置索引从0开始计数）

13

df.iloc[3:5,1:3]

	B	C
2024-01-04	13	14
2024-01-05	17	18

df.iloc[[1,3,5],1:3]

	B	C
2024-01-02	5	6
2024-01-04	13	14
2024-01-06	21	22

mixed selection:ix 既有标签又有位置筛选

df

	A	B	C	D
2024-01-01	0	1	2	3
2024-01-02	4	5	6	7
2024-01-03	8	9	10	11
2024-01-04	12	13	14	15
2024-01-05	16	17	18	19
2024-01-06	20	21	22	23

df.ix[:3,['A','C']]  # ix已在pandas 1.0中移除（与anaconda无关）；可改用 df.loc[df.index[:3],['A','C']] 或 df.iloc[:3,[0,2]]

---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

Cell In[64], line 1
----> 1 df.ix[:3,['A','C']]


File D:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py:5902, in NDFrame.__getattr__(self, name)
   5895 if (
   5896     name not in self._internal_names_set
   5897     and name not in self._metadata
   5898     and name not in self._accessors
   5899     and self._info_axis._can_hold_identifiers_and_holds_name(name)
   5900 ):
   5901     return self[name]
-> 5902 return object.__getattribute__(self, name)

AttributeError: 'DataFrame' object has no attribute 'ix'

Boolean indexing

df

	A	B	C	D
2024-01-01	0	1	2	3
2024-01-02	4	5	6	7
2024-01-03	8	9	10	11
2024-01-04	12	13	14	15
2024-01-05	16	17	18	19
2024-01-06	20	21	22	23

df[df.A>8]

	A	B	C	D
2024-01-04	12	13	14	15
2024-01-05	16	17	18	19
2024-01-06	20	21	22	23

pandas设置值

df

	A	B	C	D
2024-01-01	0	1	2	3
2024-01-02	4	5	6	7
2024-01-03	8	9	10	11
2024-01-04	12	13	14	15
2024-01-05	16	17	18	19
2024-01-06	20	21	22	23

df.iloc[2,2]=1111

df

	A	B	C	D
2024-01-01	0	1	2	3
2024-01-02	4	5	6	7
2024-01-03	8	9	1111	11
2024-01-04	12	13	14	15
2024-01-05	16	17	18	19
2024-01-06	20	21	22	23

df.loc['20240102','C']=2222

df

	A	B	C	D
2024-01-01	0	1	2	3
2024-01-02	4	5	2222	7
2024-01-03	8	9	1111	11
2024-01-04	12	13	14	15
2024-01-05	16	17	18	19
2024-01-06	20	21	22	23

df[df.A>4]=0
df

	A	B	C	D
2024-01-01	0	1	2	3
2024-01-02	4	5	2222	7
2024-01-03	0	0	0	0
2024-01-04	0	0	0	0
2024-01-05	0	0	0	0
2024-01-06	0	0	0	0

dates = pd.date_range('20240101',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
df

	A	B	C	D
2024-01-01	0	1	2	3
2024-01-02	4	5	6	7
2024-01-03	8	9	10	11
2024-01-04	12	13	14	15
2024-01-05	16	17	18	19
2024-01-06	20	21	22	23

df.loc[df.A>4,'A']=0  # 用loc一次性定位行和列，避免链式赋值 df.A[df.A>4]=0（新版pandas中会告警甚至不生效）
df

	A	B	C	D
2024-01-01	0	1	2	3
2024-01-02	4	5	6	7
2024-01-03	0	9	10	11
2024-01-04	0	13	14	15
2024-01-05	0	17	18	19
2024-01-06	0	21	22	23

dates = pd.date_range('20240101',periods=6)
dates

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06'],
              dtype='datetime64[ns]', freq='D')

df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
df

	A	B	C	D
2024-01-01	0	1	2	3
2024-01-02	4	5	6	7
2024-01-03	8	9	10	11
2024-01-04	12	13	14	15
2024-01-05	16	17	18	19
2024-01-06	20	21	22	23

df.loc[df.A>2,'B']=0  # 用loc一次性定位行和列，避免链式赋值 df.B[df.A>2]=0（新版pandas中会告警甚至不生效）
df

	A	B	C	D
2024-01-01	0	1	2	3
2024-01-02	4	0	6	7
2024-01-03	8	0	10	11
2024-01-04	12	0	14	15
2024-01-05	16	0	18	19
2024-01-06	20	0	22	23

df['F']=np.nan

df

	A	B	C	D	E	F
2024-01-01	0	1	2	3	1	NaN
2024-01-02	4	5	6	7	2	NaN
2024-01-03	8	9	10	11	3	NaN
2024-01-04	12	13	14	15	4	NaN
2024-01-05	16	17	18	19	5	NaN
2024-01-06	20	21	22	23	6	NaN

df['E']=pd.Series([1,2,3,4,5,6],index=pd.date_range('20240101',periods=6))
df

	A	B	C	D	E	F
2024-01-01	0	1	2	3	1	NaN
2024-01-02	4	5	6	7	2	NaN
2024-01-03	8	9	10	11	3	NaN
2024-01-04	12	13	14	15	4	NaN
2024-01-05	16	17	18	19	5	NaN
2024-01-06	20	21	22	23	6	NaN

df['E']=pd.Series([1,2,3,4,5,6],index=df.index)
df

	A	B	C	D	E	F
2024-01-01	0	1	2	3	1	NaN
2024-01-02	4	5	6	7	2	NaN
2024-01-03	8	9	10	11	3	NaN
2024-01-04	12	13	14	15	4	NaN
2024-01-05	16	17	18	19	5	NaN
2024-01-06	20	21	22	23	6	NaN

pandas处理丢失数据

import pandas as pd
import numpy as np

dates = pd.date_range('20240101',periods=6)
dates

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06'],
              dtype='datetime64[ns]', freq='D')

df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
df

	A	B	C	D
2024-01-01	0	1	2	3
2024-01-02	4	5	6	7
2024-01-03	8	9	10	11
2024-01-04	12	13	14	15
2024-01-05	16	17	18	19
2024-01-06	20	21	22	23

df.iloc[0,1]=np.nan
df.iloc[1,2]=np.nan
df

	A	B	C	D
2024-01-01	0	NaN	2.0	3
2024-01-02	4	5.0	NaN	7
2024-01-03	8	9.0	10.0	11
2024-01-04	12	13.0	14.0	15
2024-01-05	16	17.0	18.0	19
2024-01-06	20	21.0	22.0	23

df.dropna(axis=0,how='any')  # 丢掉含nan的行数据
# how={'any','all'} 
# 'any'：该行（或列）只要含有nan就丢掉；'all'：该行（或列）全部是nan时才丢掉

	A	B	C	D
2024-01-03	8	9.0	10.0	11
2024-01-04	12	13.0	14.0	15
2024-01-05	16	17.0	18.0	19
2024-01-06	20	21.0	22.0	23

df.dropna(axis=1,how='any')  # 丢掉含nan的列数据

	A	D
2024-01-01	0	3
2024-01-02	4	7
2024-01-03	8	11
2024-01-04	12	15
2024-01-05	16	19
2024-01-06	20	23

df

	A	B	C	D
2024-01-01	0	NaN	2.0	3
2024-01-02	4	5.0	NaN	7
2024-01-03	8	9.0	10.0	11
2024-01-04	12	13.0	14.0	15
2024-01-05	16	17.0	18.0	19
2024-01-06	20	21.0	22.0	23

df.fillna(value=0)  #填补含nan的数据

	A	B	C	D
2024-01-01	0	0.0	2.0	3
2024-01-02	4	5.0	0.0	7
2024-01-03	8	9.0	10.0	11
2024-01-04	12	13.0	14.0	15
2024-01-05	16	17.0	18.0	19
2024-01-06	20	21.0	22.0	23

df.isnull()  #判断是否缺失数据（是否含有nan）

	A	B	C	D
2024-01-01	False	True	False	False
2024-01-02	False	False	True	False
2024-01-03	False	False	False	False
2024-01-04	False	False	False	False
2024-01-05	False	False	False	False
2024-01-06	False	False	False	False

df.isnull().values.any()  # 判断整体数据是否含有nan数据（结果本身就是布尔值，无需再写 == True）

True

pandas导入导出数据

import pandas as pd

data = pd.read_csv('C:/Users/43160/Desktop/肝代码/Python/数据分析/实验数据/Advertising.csv')
data

	Number	TV	radio	newspaper	sales
0	1	230.1	37.8	69.2	22.1
1	2	44.5	39.3	45.1	10.4
2	3	17.2	45.9	69.3	9.3
3	4	151.5	41.3	58.5	18.5
4	5	180.8	10.8	58.4	12.9
...	...	...	...	...	...
195	196	38.2	3.7	13.8	7.6
196	197	94.2	4.9	8.1	9.7
197	198	177.0	9.3	6.4	12.8
198	199	283.6	42.0	66.2	25.5
199	200	232.1	8.6	8.7	13.4

200 rows × 5 columns

data.to_csv('advertising.csv')  #数据保存

pandas合并DataFrame

1.concatenating

import pandas as pd
import numpy as np

df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])

df1

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0

df2

	a	b	c	d
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0

df3

	a	b	c	d
0	2.0	2.0	2.0	2.0
1	2.0	2.0	2.0	2.0
2	2.0	2.0	2.0	2.0

# 上下合并，即对行进行操作
res = pd.concat([df1,df2,df3],axis=0)
res

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0
0	2.0	2.0	2.0	2.0
1	2.0	2.0	2.0	2.0
2	2.0	2.0	2.0	2.0

res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
res

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
3	1.0	1.0	1.0	1.0
4	1.0	1.0	1.0	1.0
5	1.0	1.0	1.0	1.0
6	2.0	2.0	2.0	2.0
7	2.0	2.0	2.0	2.0
8	2.0	2.0	2.0	2.0

#左右合并，即对列进行操作
res = pd.concat([df1,df2,df3],axis=1)
res

	a	b	c	d	a	b	c	d	a	b	c	d
0	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0
1	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0
2	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0

join参数：可取 'inner' 或 'outer'

df1 = pd.DataFrame(np.ones((3,4))*0,index=[1,2,3],columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,index=[2,3,4],columns=['b','c','d','e'])

df1

	a	b	c	d
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0

df2

	b	c	d	e
2	1.0	1.0	1.0	1.0
3	1.0	1.0	1.0	1.0
4	1.0	1.0	1.0	1.0

res = pd.concat([df1,df2])  # 默认"outer"模式
res

	a	b	c	d	e
1	0.0	0.0	0.0	0.0	NaN
2	0.0	0.0	0.0	0.0	NaN
3	0.0	0.0	0.0	0.0	NaN
2	NaN	1.0	1.0	1.0	1.0
3	NaN	1.0	1.0	1.0	1.0
4	NaN	1.0	1.0	1.0	1.0

res = pd.concat([df1,df2],join='outer')  # 类似并集
res

	a	b	c	d	e
1	0.0	0.0	0.0	0.0	NaN
2	0.0	0.0	0.0	0.0	NaN
3	0.0	0.0	0.0	0.0	NaN
2	NaN	1.0	1.0	1.0	1.0
3	NaN	1.0	1.0	1.0	1.0
4	NaN	1.0	1.0	1.0	1.0

res = pd.concat([df1,df2],join='inner')  # 类似交集
res

	b	c	d
1	0.0	0.0	0.0
2	0.0	0.0	0.0
3	0.0	0.0	0.0
2	1.0	1.0	1.0
3	1.0	1.0	1.0
4	1.0	1.0	1.0

res = pd.concat([df1,df2],join='inner',ignore_index=True)  # 重新排列索引
res

	b	c	d
0	0.0	0.0	0.0
1	0.0	0.0	0.0
2	0.0	0.0	0.0
3	1.0	1.0	1.0
4	1.0	1.0	1.0
5	1.0	1.0	1.0

join_axes

df1

	a	b	c	d
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0

df2

	b	c	d	e
2	1.0	1.0	1.0	1.0
3	1.0	1.0	1.0	1.0
4	1.0	1.0	1.0	1.0

res = pd.concat([df1,df2],axis=1)  
res

	a	b	c	d	b	c	d	e
1	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN
2	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0
3	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0
4	NaN	NaN	NaN	NaN	1.0	1.0	1.0	1.0

res = pd.concat([df1,df2],axis=1,join_axes=[df1.index])  # 按df1的索引对齐；join_axes已在pandas 1.0中移除（与anaconda无关），可改用 pd.concat([df1,df2],axis=1).reindex(df1.index)
res

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

Cell In[161], line 1
----> 1 res = pd.concat([df1,df2],axis=1,join_axes=[df1.index])  # 考虑df1的索引，但是已在anaconda中去除
      2 res


File D:\ProgramData\anaconda3\Lib\site-packages\pandas\util\_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    325 if len(args) > num_allow_args:
    326     warnings.warn(
    327         msg.format(arguments=_format_argument_list(allow_args)),
    328         FutureWarning,
    329         stacklevel=find_stack_level(),
    330     )
--> 331 return func(*args, **kwargs)


TypeError: concat() got an unexpected keyword argument 'join_axes'

2.append合并

df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])

df1

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0

df2

	a	b	c	d
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0

res = df1.append(df2,ignore_index=True)  # append自pandas 1.4起弃用、2.0起移除；新版请改用 pd.concat([df1,df2],ignore_index=True)
res

C:\Users\43160\AppData\Local\Temp\ipykernel_15804\3917667868.py:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  res = df1.append(df2,ignore_index=True)

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
3	1.0	1.0	1.0	1.0
4	1.0	1.0	1.0	1.0
5	1.0	1.0	1.0	1.0

df3 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])

res = df1.append([df2,df3],ignore_index=True)
res

C:\Users\43160\AppData\Local\Temp\ipykernel_15804\3744420715.py:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  res = df1.append([df2,df3],ignore_index=True)

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
3	1.0	1.0	1.0	1.0
4	1.0	1.0	1.0	1.0
5	1.0	1.0	1.0	1.0
6	1.0	1.0	1.0	1.0
7	1.0	1.0	1.0	1.0
8	1.0	1.0	1.0	1.0

res = df1.append([df2,df3])
res

C:\Users\43160\AppData\Local\Temp\ipykernel_15804\1214992729.py:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  res = df1.append([df2,df3])

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0

# 按行添加
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
s1 = pd.Series([1,2,3,4],index=['a','b','c','d']) 
s1

a    1
b    2
c    3
d    4
dtype: int64

res = df1.append(s1,ignore_index=True)  # 新版pandas（2.0起）请改用 pd.concat([df1,s1.to_frame().T],ignore_index=True)
res

C:\Users\43160\AppData\Local\Temp\ipykernel_15804\2713288841.py:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  res = df1.append(s1,ignore_index=True)

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
3	1.0	2.0	3.0	4.0

3.merge合并

import pandas as pd

left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})

right

	key	C	D
0	K0	C0	D0
1	K1	C1	D1
2	K2	C2	D2
3	K3	C3	D3

left

	key	A	B
0	K0	A0	B0
1	K1	A1	B1
2	K2	A2	B2
3	K3	A3	B3

res = pd.merge(left,right,on='key')   # 基于'key'进行合并
res

	key	A	B	C	D
0	K0	A0	B0	C0	D0
1	K1	A1	B1	C1	D1
2	K2	A2	B2	C2	D2
3	K3	A3	B3	C3	D3

consider two keys

left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                             'key2': ['K0', 'K1', 'K0', 'K1'],
                             'A': ['A0', 'A1', 'A2', 'A3'],
                             'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                              'key2': ['K0', 'K0', 'K0', 'K0'],
                              'C': ['C0', 'C1', 'C2', 'C3'],
                              'D': ['D0', 'D1', 'D2', 'D3']})

left

	key1	key2	A	B
0	K0	K0	A0	B0
1	K0	K1	A1	B1
2	K1	K0	A2	B2
3	K2	K1	A3	B3

right

	key1	key2	C	D
0	K0	K0	C0	D0
1	K1	K0	C1	D1
2	K1	K0	C2	D2
3	K2	K0	C3	D3

res = pd.merge(left,right,on=['key1','key2'])
res

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K1	K0	A2	B2	C1	D1
2	K1	K0	A2	B2	C2	D2

res = pd.merge(left,right,on=['key1','key2'],how='inner')  # 默认inner
res

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K1	K0	A2	B2	C1	D1
2	K1	K0	A2	B2	C2	D2

# how={'inner','outer','right','left'}

res = pd.merge(left,right,on=['key1','key2'],how='outer')  
res

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K0	K1	A1	B1	NaN	NaN
2	K1	K0	A2	B2	C1	D1
3	K1	K0	A2	B2	C2	D2
4	K2	K1	A3	B3	NaN	NaN
5	K2	K0	NaN	NaN	C3	D3

left

	key1	key2	A	B
0	K0	K0	A0	B0
1	K0	K1	A1	B1
2	K1	K0	A2	B2
3	K2	K1	A3	B3

right

	key1	key2	C	D
0	K0	K0	C0	D0
1	K1	K0	C1	D1
2	K1	K0	C2	D2
3	K2	K0	C3	D3

res = pd.merge(left,right,on=['key1','key2'],how='left')  
res

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K0	K1	A1	B1	NaN	NaN
2	K1	K0	A2	B2	C1	D1
3	K1	K0	A2	B2	C2	D2
4	K2	K1	A3	B3	NaN	NaN

res = pd.merge(left,right,on=['key1','key2'],how='right')  
res

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K1	K0	A2	B2	C1	D1
2	K1	K0	A2	B2	C2	D2
3	K2	K0	NaN	NaN	C3	D3

indicator

# indicator
df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})

df1

	col1	col_left
0	0	a
1	1	b

df2

	col1	col_right
0	1	2
1	2	2
2	2	2

res = pd.merge(df1,df2,on='col1',how='outer',indicator=True)  # indicator=True会新增_merge列，标明每一行来自左表(left_only)、右表(right_only)还是两者(both)
res

	col1	col_left	col_right	_merge
0	0	a	NaN	left_only
1	1	b	2.0	both
2	2	NaN	2.0	right_only
3	2	NaN	2.0	right_only

res = pd.merge(df1,df2,on='col1',how='outer',indicator='indicator_columns')  # indicator传入字符串时，用该字符串作为指示列的列名
res

	col1	col_left	col_right	indicator_columns
0	0	a	NaN	left_only
1	1	b	2.0	both
2	2	NaN	2.0	right_only
3	2	NaN	2.0	right_only

index

left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                                  'B': ['B0', 'B1', 'B2']},
                                  index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                                     'D': ['D0', 'D2', 'D3']},
                                      index=['K0', 'K2', 'K3'])

left

	A	B
K0	A0	B0
K1	A1	B1
K2	A2	B2

right

	C	D
K0	C0	D0
K2	C2	D2
K3	C3	D3

# left_index and right_index
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
res

	A	B	C	D
K0	A0	B0	C0	D0
K1	A1	B1	NaN	NaN
K2	A2	B2	C2	D2
K3	NaN	NaN	C3	D3

res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
res

	A	B	C	D
K0	A0	B0	C0	D0
K2	A2	B2	C2	D2

# handle overlapping
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})

boys

	k	age
0	K0	1
1	K1	2
2	K2	3

girls

	k	age
0	K0	4
1	K0	5
2	K3	6

res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')  #两表中有同名但含义不同的列时，用suffixes给列名加后缀加以区分
res

	k	age_boy	age_girl
0	K0	1	4
1	K0	1	5

pandas:数据可视化

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# plot data

#Series 一维数组，线性数据
data = pd.Series(np.random.randn(1000),index=np.arange(1000))
data

0      0.547677
1     -0.288794
2      0.556806
3      1.261752
4     -1.912560
         ...   
995    0.250478
996   -1.022430
997   -1.123374
998   -0.104338
999    1.049590
Length: 1000, dtype: float64

data =data.cumsum()  #累加
data.plot()    #输入数据
#plt.plot(x=,y=)   输入数据
plt.show()

在这里插入图片描述

# DataFrame
data = pd.DataFrame(np.random.randn(1000,4),
                   index=np.arange(1000),
                   columns=['A','B','C','D'])  #四个数据属性

data=data.cumsum()

data

	A	B	C	D
0	-1.854020	-1.031726	0.873153	1.601868
1	-2.494261	-1.244128	0.510932	2.150016
2	-2.516531	-2.961676	-0.284869	1.238185
3	-1.974520	-3.029144	-0.258707	1.761474
4	-2.170233	-2.911106	0.002738	1.778242
...	...	...	...	...
995	-15.542631	-8.357456	24.989268	-3.500648
996	-14.898920	-7.755639	24.748827	-3.434445
997	-15.438401	-10.115086	23.819015	-2.865272
998	-16.757351	-9.948964	24.401000	-1.790440
999	-18.415608	-10.377505	24.092952	-2.959285

1000 rows × 4 columns

data.head()

	A	B	C	D
0	-1.854020	-1.031726	0.873153	1.601868
1	-2.494261	-1.244128	0.510932	2.150016
2	-2.516531	-2.961676	-0.284869	1.238185
3	-1.974520	-3.029144	-0.258707	1.761474
4	-2.170233	-2.911106	0.002738	1.778242

data.plot()
plt.show()

在这里插入图片描述

# plot methods:
# 'bar'条形图, 'hist', 'box', 'kde', 'area', 'scatter'散点图, 'hexbin', 'pie'

#scatter只有两个属性，意为散点图
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
plt.show()

在这里插入图片描述

ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)  # ax=ax表示把这组散点画在上一行返回的同一个坐标轴上，两组数据叠加显示在一张图里
plt.show()

在这里插入图片描述

文章来源:https://blog.csdn.net/ccBcc_/article/details/135366905
本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若内容造成侵权/违法违规/事实不符，请联系我的编程经验分享网邮箱：veading@qq.com进行投诉反馈，一经查实，立即删除！

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0
0	2.0	2.0	2.0	2.0
1	2.0	2.0	2.0	2.0
2	2.0	2.0	2.0	2.0

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
3	1.0	1.0	1.0	1.0
4	1.0	1.0	1.0	1.0
5	1.0	1.0	1.0	1.0
6	2.0	2.0	2.0	2.0
7	2.0	2.0	2.0	2.0
8	2.0	2.0	2.0	2.0

	a	b	c	d	a	b	c	d
0	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0
1	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0
2	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0

	a	b	c	d	e
1	0.0	0.0	0.0	0.0	NaN
2	0.0	0.0	0.0	0.0	NaN
3	0.0	0.0	0.0	0.0	NaN
2	NaN	1.0	1.0	1.0	1.0
3	NaN	1.0	1.0	1.0	1.0
4	NaN	1.0	1.0	1.0	1.0

	a	b	c	d	e
1	0.0	0.0	0.0	0.0	NaN
2	0.0	0.0	0.0	0.0	NaN
3	0.0	0.0	0.0	0.0	NaN
2	NaN	1.0	1.0	1.0	1.0
3	NaN	1.0	1.0	1.0	1.0
4	NaN	1.0	1.0	1.0	1.0

	b	c	d
1	0.0	0.0	0.0
2	0.0	0.0	0.0
3	0.0	0.0	0.0
2	1.0	1.0	1.0
3	1.0	1.0	1.0
4	1.0	1.0	1.0

	b	c	d
0	0.0	0.0	0.0
1	0.0	0.0	0.0
2	0.0	0.0	0.0
3	1.0	1.0	1.0
4	1.0	1.0	1.0
5	1.0	1.0	1.0

	a	b	c	d	b	c	d	e
1	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN
2	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0
3	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0
4	NaN	NaN	NaN	NaN	1.0	1.0	1.0	1.0

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
3	1.0	1.0	1.0	1.0
4	1.0	1.0	1.0	1.0
5	1.0	1.0	1.0	1.0

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
3	1.0	1.0	1.0	1.0
4	1.0	1.0	1.0	1.0
5	1.0	1.0	1.0	1.0
6	1.0	1.0	1.0	1.0
7	1.0	1.0	1.0	1.0
8	1.0	1.0	1.0	1.0

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0
0	2.0	2.0	2.0	2.0
1	2.0	2.0	2.0	2.0
2	2.0	2.0	2.0	2.0

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
3	1.0	1.0	1.0	1.0
4	1.0	1.0	1.0	1.0
5	1.0	1.0	1.0	1.0
6	2.0	2.0	2.0	2.0
7	2.0	2.0	2.0	2.0
8	2.0	2.0	2.0	2.0

	a	b	c	d	a	b	c	d
0	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0
1	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0
2	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0

	a	b	c	d	e
1	0.0	0.0	0.0	0.0	NaN
2	0.0	0.0	0.0	0.0	NaN
3	0.0	0.0	0.0	0.0	NaN
2	NaN	1.0	1.0	1.0	1.0
3	NaN	1.0	1.0	1.0	1.0
4	NaN	1.0	1.0	1.0	1.0

	a	b	c	d	e
1	0.0	0.0	0.0	0.0	NaN
2	0.0	0.0	0.0	0.0	NaN
3	0.0	0.0	0.0	0.0	NaN
2	NaN	1.0	1.0	1.0	1.0
3	NaN	1.0	1.0	1.0	1.0
4	NaN	1.0	1.0	1.0	1.0

	b	c	d
1	0.0	0.0	0.0
2	0.0	0.0	0.0
3	0.0	0.0	0.0
2	1.0	1.0	1.0
3	1.0	1.0	1.0
4	1.0	1.0	1.0

	b	c	d
0	0.0	0.0	0.0
1	0.0	0.0	0.0
2	0.0	0.0	0.0
3	1.0	1.0	1.0
4	1.0	1.0	1.0
5	1.0	1.0	1.0

	a	b	c	d	b	c	d	e
1	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN
2	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0
3	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0
4	NaN	NaN	NaN	NaN	1.0	1.0	1.0	1.0

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
3	1.0	1.0	1.0	1.0
4	1.0	1.0	1.0	1.0
5	1.0	1.0	1.0	1.0

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
3	1.0	1.0	1.0	1.0
4	1.0	1.0	1.0	1.0
5	1.0	1.0	1.0	1.0
6	1.0	1.0	1.0	1.0
7	1.0	1.0	1.0	1.0
8	1.0	1.0	1.0	1.0

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0
0	2.0	2.0	2.0	2.0
1	2.0	2.0	2.0	2.0
2	2.0	2.0	2.0	2.0

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
3	1.0	1.0	1.0	1.0
4	1.0	1.0	1.0	1.0
5	1.0	1.0	1.0	1.0
6	2.0	2.0	2.0	2.0
7	2.0	2.0	2.0	2.0
8	2.0	2.0	2.0	2.0

	a	b	c	d	a	b	c	d
0	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0
1	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0
2	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0

	a	b	c	d	e
1	0.0	0.0	0.0	0.0	NaN
2	0.0	0.0	0.0	0.0	NaN
3	0.0	0.0	0.0	0.0	NaN
2	NaN	1.0	1.0	1.0	1.0
3	NaN	1.0	1.0	1.0	1.0
4	NaN	1.0	1.0	1.0	1.0

	a	b	c	d	e
1	0.0	0.0	0.0	0.0	NaN
2	0.0	0.0	0.0	0.0	NaN
3	0.0	0.0	0.0	0.0	NaN
2	NaN	1.0	1.0	1.0	1.0
3	NaN	1.0	1.0	1.0	1.0
4	NaN	1.0	1.0	1.0	1.0

	b	c	d
1	0.0	0.0	0.0
2	0.0	0.0	0.0
3	0.0	0.0	0.0
2	1.0	1.0	1.0
3	1.0	1.0	1.0
4	1.0	1.0	1.0

	b	c	d
0	0.0	0.0	0.0
1	0.0	0.0	0.0
2	0.0	0.0	0.0
3	1.0	1.0	1.0
4	1.0	1.0	1.0
5	1.0	1.0	1.0

	a	b	c	d	b	c	d	e
1	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN
2	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0
3	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0
4	NaN	NaN	NaN	NaN	1.0	1.0	1.0	1.0

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
3	1.0	1.0	1.0	1.0
4	1.0	1.0	1.0	1.0
5	1.0	1.0	1.0	1.0

	a	b	c	d
0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0
3	1.0	1.0	1.0	1.0
4	1.0	1.0	1.0	1.0
5	1.0	1.0	1.0	1.0
6	1.0	1.0	1.0	1.0
7	1.0	1.0	1.0	1.0
8	1.0	1.0	1.0	1.0