目录
定义
全称: 'panel data' and 'python data analysis'
Analy: Series(一维数据)、DataFrame(二维数据)
主要应用: 数据清洗:处理缺失数据、重复数据等
数据转换:改变数据的形状、结构或格式
数据分析: 进行统计、聚合、分组
数据可视化:整合Matplotlib 进行数据可视化
基本操作
一维数组操作
import pandas as pd
a = [1, 2, 3]
mystr = pd.Series(a)
print('列表')
print(mystr)
a = ['a', 'b', 'c']
mystr = pd.Series(a, index=['x', 'y', 'z'])
print('自定义列名')
print(mystr)
str_dict = {'1': 'a', '2': 'b'}
mystr = pd.Series(str_dict)
print('字典类型')
print(mystr)
#############运行结果########################
# 列表
# 0 1
# 1 2
# 2 3
# dtype: int64
# 自定义列名
# x a
# y b
# z c
# dtype: object
# 字典类型
# 1 a
# 2 b
# dtype: object
二维数组操作
import pandas as pd
import numpy as np
# 从字典创建
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Age': [25, 30, 35, 40],
'City': ['New York', 'Paris', 'London', 'Tokyo']}
df = pd.DataFrame(data)
print("从字典创建的DataFrame:")
print(df)
# 从列表创建
data = [['Apple', 10], ['Banana', 15], ['Orange', 8]]
df_fruits = pd.DataFrame(data, columns=['Fruit', 'Price'])
print("\n从列表创建的DataFrame:")
print(df_fruits)
# 查看前几行
print("\n前两行:")
print(df.head(2))
# 查看基本信息
print("\nDataFrame信息:")
print(df.info())
# 查看统计信息
print("\n统计信息:")
print(df.describe())
# 查看列名
print("\n列名:")
print(df.columns)
# 查看形状
print("\n形状(行,列):")
print(df.shape)
#############运行结果################
# 从字典创建的DataFrame:
# Name Age City
# 0 Alice 25 New York
# 1 Bob 30 Paris
# 2 Charlie 35 London
# 3 David 40 Tokyo
#
# 从列表创建的DataFrame:
# Fruit Price
# 0 Apple 10
# 1 Banana 15
# 2 Orange 8
#
# 前两行:
# Name Age City
# 0 Alice 25 New York
# 1 Bob 30 Paris
#
# DataFrame信息:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 4 entries, 0 to 3
# Data columns (total 3 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 Name 4 non-null object
# 1 Age 4 non-null int64
# 2 City 4 non-null object
# dtypes: int64(1), object(2)
# memory usage: 224.0+ bytes
# None
#
# 统计信息:
# Age
# count 4.000000
# mean 32.500000
# std 6.454972
# min 25.000000
# 25% 28.750000
# 50% 32.500000
# 75% 36.250000
# max 40.000000
#
# 列名:
# Index(['Name', 'Age', 'City'], dtype='object')
#
# 形状(行,列):
# (4, 3)
数据选择过滤
import pandas as pd
import numpy as np
# 从字典创建
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Age': [25, 30, 35, 40],
'City': ['New York', 'Paris', 'London', 'Tokyo']}
df = pd.DataFrame(data)
# 选择列
print("\n选择Name列:")
print(df['Name'])
# 选择多列
print("\n选择Name和Age列:")
print(df[['Name', 'Age']])
# 选择行
print("\n选择第二行:")
print(df.iloc[1])
# 选择多行
print("\n选择前两行:")
print(df.iloc[0:2])
# 条件选择
print("\n选择年龄大于30的人:")
print(df[df['Age'] > 30])
# 使用loc选择
print("\n使用loc选择:")
print(df.loc[df['Age'] > 30, ['Name', 'City']])
############运行结果######################
# 选择Name列:
# 0 Alice
# 1 Bob
# 2 Charlie
# 3 David
# Name: Name, dtype: object
#
# 选择Name和Age列:
# Name Age
# 0 Alice 25
# 1 Bob 30
# 2 Charlie 35
# 3 David 40
#
# 选择第二行:
# Name Bob
# Age 30
# City Paris
# Name: 1, dtype: object
#
# 选择前两行:
# Name Age City
# 0 Alice 25 New York
# 1 Bob 30 Paris
#
# 选择年龄大于30的人:
# Name Age City
# 2 Charlie 35 London
# 3 David 40 Tokyo
#
# 使用loc选择:
# Name City
# 2 Charlie London
# 3 David Tokyo
数据处理
数据清洗
import pandas as pd
import numpy as np
# 创建有缺失值的DataFrame
data = {'A': [1, 2, np.nan, 4],
'B': [5, np.nan, np.nan, 8],
'C': [10, 20, 30, 40]}
df_missing = pd.DataFrame(data)
print("\n有缺失值的DataFrame:")
print(df_missing)
# 检查缺失值
print("\n缺失值统计:")
print(df_missing.isnull().sum())
# 填充缺失值
print("\n填充缺失值:")
print(df_missing.fillna(value={'A': 0, 'B': df_missing['B'].mean()}))
# 删除缺失值
print("\n删除包含缺失值的行:")
print(df_missing.dropna())
# 删除全为缺失值的列
print("\n删除全为缺失值的列:")
print(df_missing.dropna(axis=1, how='all'))
#################运行结果#############################
# 有缺失值的DataFrame:
# A B C
# 0 1.0 5.0 10
# 1 2.0 NaN 20
# 2 NaN NaN 30
# 3 4.0 8.0 40
#
# 缺失值统计:
# A 1
# B 2
# C 0
# dtype: int64
#
# 填充缺失值:
# A B C
# 0 1.0 5.0 10
# 1 2.0 6.5 20
# 2 0.0 6.5 30
# 3 4.0 8.0 40
#
# 删除包含缺失值的行:
# A B C
# 0 1.0 5.0 10
# 3 4.0 8.0 40
#
# 删除全为缺失值的列:
# C
# 0 10
# 1 20
# 2 30
# 3 40
数据转换
import pandas as pd
# 从字典创建
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Age': [25, 30, 35, 40],
'City': ['New York', 'Paris', 'London', 'Tokyo']}
df = pd.DataFrame(data)
print(df.dtypes)
# 类型转换
df['Age'] = df['Age'].astype('float')
print("\n转换Age列为浮点型:")
print(df.dtypes)
# 重命名列
df_renamed = df.rename(columns={'Name': 'Full Name', 'City': 'Location'})
print("\n重命名列后:")
print(df_renamed.columns)
# 替换值
df_replaced = df.replace({'Paris': 'Rome', 'London': 'Berlin'})
print("\n替换值后:")
print(df_replaced)
# 应用函数
print("\n应用函数:")
print(df['Age'].apply(lambda x: x + 1))
# 字符串操作
print("\n字符串操作:")
print(df['Name'].str.upper())
数据分析
排序
import pandas as pd
# 从字典创建
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Age': [25, 30, 35, 40],
'City': ['New York', 'Paris', 'London', 'Tokyo']}
df = pd.DataFrame(data)
# 按列排序
print("\n按Age降序排序:")
print(df.sort_values('Age', ascending=False)) # ascending=False, 反序
# 多列排序
print("\n先按City升序,再按Age降序排序:")
print(df.sort_values(['City', 'Age'], ascending=[True, False]))
分组聚合
import pandas as pd
# 创建示例数据
data = {'Department': ['Sales', 'Sales', 'IT', 'IT', 'HR', 'HR'],
'Employee': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank'],
'Salary': [5000, 6000, 7000, 8000, 5500, 6500]}
df_company = pd.DataFrame(data)
# 分组聚合
print("\n按部门计算平均工资:")
print(df_company.groupby('Department')['Salary'].mean())
# 多聚合函数
print("\n按部门计算工资的多种统计量:")
print(df_company.groupby('Department')['Salary'].agg(['mean', 'max', 'min', 'count']))
# 多列分组
print("\n按部门分组并显示员工列表:")
print(df_company.groupby('Department')['Employee'].apply(list))
#############运行结果##################
# 按部门计算平均工资:
# Department
# HR 6000.0
# IT 7500.0
# Sales 5500.0
# Name: Salary, dtype: float64
#
# 按部门计算工资的多种统计量:
# mean max min count
# Department
# HR 6000.0 6500 5500 2
# IT 7500.0 8000 7000 2
# Sales 5500.0 6000 5000 2
#
# 按部门分组并显示员工列表:
# Department
# HR [Eve, Frank]
# IT [Charlie, David]
# Sales [Alice, Bob]
# Name: Employee, dtype: object
数据透视表
import pandas as pd
import numpy as np
# 创建示例数据
data = {'Date': ['2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02'],
'Product': ['A', 'B', 'A', 'B'],
'Sales': [100, 150, 200, 50]}
df_sales = pd.DataFrame(data)
# 创建透视表
pivot = pd.pivot_table(df_sales, values='Sales', index='Date', columns='Product', aggfunc=np.sum)
print("\n透视表:")
print(pivot)
# 添加汇总
pivot_margins = pd.pivot_table(df_sales, values='Sales', index='Date', columns='Product',
aggfunc=np.sum, margins=True, margins_name='Total')
print("\n带汇总的透视表:")
print(pivot_margins)
# 透视表:
# Product A B
# Date
# 2023-01-01 100 150
# 2023-01-02 200 50
#
# 带汇总的透视表:
# Product A B Total
# Date
# 2023-01-01 100 150 250
# 2023-01-02 200 50 250
# Total 300 200 500
高级操作
合并数据
import pandas as pd
# 创建两个DataFrame
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['B', 'C', 'D'], 'value2': [4, 5, 6]})
# 内连接
print("\n内连接:")
print(pd.merge(df1, df2, on='key', how='inner'))
# 左连接
print("\n左连接:")
print(pd.merge(df1, df2, on='key', how='left'))
# 左连接
print("\n右连接:")
print(pd.merge(df1, df2, on='key', how='right'))
# 外连接
print("\n外连接:")
print(pd.merge(df1, df2, on='key', how='outer'))
# 连接多个键
df3 = pd.DataFrame({'key1': ['A', 'B', 'C'], 'key2': ['X', 'Y', 'Z'], 'value': [10, 20, 30]})
df4 = pd.DataFrame({'key1': ['B', 'C', 'D'], 'key2': ['Y', 'Z', 'W'], 'value': [40, 50, 60]})
print("\n多键连接:")
print(pd.merge(df3, df4, on=['key1', 'key2'], how='inner'))
时间序列处理
import pandas as pd
import numpy as np
# 创建时间序列数据
dates = pd.date_range('20230101', periods=6)
df_time = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print("\n时间序列DataFrame:")
print(df_time)
# 时间索引选择
print("\n选择特定日期范围:")
print(df_time['2023-01-02':'2023-01-04'])
# 重采样
print("\n按月重采样求均值:")
print(df_time.resample('ME').mean())
###################运行结果#########################
# 时间序列DataFrame:
# A B C D
# 2023-01-01 -0.800839 -0.953824 0.958557 1.044269
# 2023-01-02 0.203414 1.462324 0.099291 0.457230
# 2023-01-03 -0.113450 0.441178 -0.779749 1.158564
# 2023-01-04 -0.741451 -1.353738 1.906289 2.513789
# 2023-01-05 -1.719984 -0.305153 -1.283332 0.728910
# 2023-01-06 -0.363050 -1.591411 -2.289244 -0.076304
#
# 选择特定日期范围:
# A B C D
# 2023-01-02 0.203414 1.462324 0.099291 0.457230
# 2023-01-03 -0.113450 0.441178 -0.779749 1.158564
# 2023-01-04 -0.741451 -1.353738 1.906289 2.513789
#
# 按月重采样求均值:
# A B C D
# 2023-01-31 -0.589227 -0.383437 -0.231365 0.971077
自定义函数调用
import pandas as pd
# 从字典创建
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Age': [25, 30, 35, 40],
'City': ['New York', 'Paris', 'London', 'Tokyo']}
df = pd.DataFrame(data)
# 使用apply应用自定义函数
def age_group(age):
if age < 30:
return 'Young'
elif age < 40:
return 'Middle'
else:
return 'Senior'
print("\n应用自定义函数:")
df['Age Group'] = df['Age'].apply(age_group)
print(df)
# 使用pipe进行方法链
def add_prefix(df, prefix):
df['Name'] = prefix + df['Name']
return df
print("\n使用pipe:")
print(df.pipe(add_prefix, prefix='Mr. '))
#####################运行结果###########################
# 应用自定义函数:
# Name Age City Age Group
# 0 Alice 25 New York Young
# 1 Bob 30 Paris Middle
# 2 Charlie 35 London Middle
# 3 David 40 Tokyo Senior
#
# 使用pipe:
# Name Age City Age Group
# 0 Mr. Alice 25 New York Young
# 1 Mr. Bob 30 Paris Middle
# 2 Mr. Charlie 35 London Middle
# 3 Mr. David 40 Tokyo Senior
数据可视化集成
import pandas as pd
import matplotlib.pyplot as plt
# 创建示例数据
df_plot = pd.DataFrame({
'Year': [2010, 2011, 2012, 2013, 2014],
'Sales': [100, 120, 150, 180, 200],
'Profit': [20, 25, 30, 35, 40]
})
# 绘制折线图
df_plot.plot(x='Year', y=['Sales', 'Profit'], kind='line', title='Sales and Profit Over Years')
plt.show()
# 绘制柱状图
df_plot.plot(x='Year', y='Sales', kind='bar', title='Sales by Year')
plt.show()
# 绘制散点图
df_plot.plot(x='Sales', y='Profit', kind='scatter', title='Profit vs Sales')
plt.show()
运行结果:
数据导出和导入
import pandas as pd
# 从字典创建
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Age': [25, 30, 35, 40],
'City': ['New York', 'Paris', 'London', 'Tokyo']}
df = pd.DataFrame(data)
# 导出到CSV
df.to_csv('output.csv', index=False)
# 从CSV导入
df_imported = pd.read_csv('output.csv')
print("\n从CSV导入的数据:")
print(df_imported.head())
# 导出到Excel
df.to_excel('output.xlsx', sheet_name='Sheet1', index=False)
# 从Excel导入
df_excel = pd.read_excel('output.xlsx', sheet_name='Sheet1')
print("\n从Excel导入的数据:")
print(df_excel.head())
#####################运行结果################################
# 从CSV导入的数据:
# Name Age City
# 0 Alice 25 New York
# 1 Bob 30 Paris
# 2 Charlie 35 London
# 3 David 40 Tokyo
#
# 从Excel导入的数据:
# Name Age City
# 0 Alice 25 New York
# 1 Bob 30 Paris
# 2 Charlie 35 London
# 3 David 40 Tokyo
大数据分块处理
import pandas as pd
# 从字典创建
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Age': [25, 30, 35, 40],
'City': ['Paris', 'Paris', 'London', 'London']}
df = pd.DataFrame(data)
# 导出到CSV
df.to_csv('output.csv', index=False)
# 分块读取大文件
chunk_size = 3
chunks = pd.read_csv('output.csv', chunksize=chunk_size)
# 分块处理
results = []
for chunk in chunks:
processed = chunk.groupby('City')['Age'].sum()
results.append(processed)
# 合并结果
final_result = pd.concat(results).groupby(level=0).sum()
print(final_result)
############运行结果##########################
# City
# London 75
# Paris 55
# Name: Age, dtype: int64
参考来源: deepseek。。。