头歌:Pandas合并数据集

2023-12-13 05:48:33

第1关:Concat与Append操作

import pandas as pd

def task1():
    """Join the two step-1 CSV files side by side and index the result by ``Ladder``.

    Reads ``step1/data.csv`` and ``step1/data1.csv`` (paths relative to the
    current working directory), concatenates them column-wise, replaces every
    missing value with 0 and promotes the ``Ladder`` column to the row index.

    Returns:
        pandas.DataFrame: the combined table indexed by ``Ladder``; the
        ``Ladder`` column itself is no longer among the columns.

    Raises:
        KeyError: if neither file provides a ``Ladder`` column.
    """
    #********** Begin **********#
    frames = [pd.read_csv('step1/data.csv'), pd.read_csv('step1/data1.csv')]
    # axis=1 aligns rows on their index labels, so a shorter file leaves NaN
    # cells in its columns.
    result = pd.concat(frames, axis=1)
    # fillna returns a new frame rather than modifying in place; the result
    # must be kept or the NaN cells survive.
    result = result.fillna(0)
    # set_index moves the column into the index and removes it from the columns.
    result = result.set_index('Ladder')
    #********** End **********#
    return result

第2关:合并与连接

import pandas as pd


def task2(dataset1,dataset2,dataset3):
    """Build one click-count table per user from three small in-line tables.

    The user table is left-joined with the click table on ``user_id``; the
    third table (whose key column is called ``id``) is renamed to match and
    stacked on top. When a ``user_id`` appears in both, the row coming from
    the third table is kept.

    NOTE(review): ``dataset1``, ``dataset2`` and ``dataset3`` are accepted but
    never read; the data is hard-coded below. Confirm with the caller whether
    the arguments are meant to replace these literals.

    Returns:
        pandas.DataFrame: columns ``city``, ``user_id``, ``page_click_count``,
        one row per ``user_id``, ordered by ascending ``user_id``.
    """
    # ********** Begin **********#
    users = pd.DataFrame({"city":['BeiJing','NanJing','BeiJing','TianJin'],'user_id':[1,2,3,4]})
    clicks = pd.DataFrame({"page_click_count":[20,38,10],'user_id':[1,2,3]})
    extra = pd.DataFrame({"city":['TianJin','BeiJing','ShangHai','GuangZhou','ChangSha','HangZhou'],'id':[4,5,6,7,8,9],'page_click_count':[18,5,25,16,19,50]})

    # Align the key column name with the other two tables.
    extra = extra.rename(columns={"id": "user_id"})

    # Left join keeps users that have no click record (their count is NaN).
    joined = pd.merge(users, clicks, how='left', on="user_id")

    # `extra` goes first so that its rows win when a user_id is duplicated.
    combined = pd.concat([extra, joined], ignore_index=True)

    # De-duplicate BEFORE sorting: drop_duplicates keeps the first occurrence
    # in concat order, so the surviving row does not depend on how the
    # (by default non-stable) sort happens to order equal keys.
    result = combined.drop_duplicates("user_id").sort_values("user_id")
    # ********** End **********#
    return result

第3关:案例:美国各州的统计数据

import pandas as pd
import numpy as np

def task3():
    """Print the five densest and five sparsest entries for the year 2010.

    Reads ``state-population.csv``, ``state-areas.csv`` and
    ``state-abbrevs.csv`` from ``./step3``, attaches full state names and
    areas to the population records, drops rows with any missing value, adds
    the 2010 ``under18`` population to the 2010 ``total`` population, divides
    by area and prints the top and bottom five of the sorted densities.

    Returns:
        None. The result is written to standard output only.
    """
    #********** Begin **********#
    # Load the three CSV files.
    population = pd.read_csv('./step3/state-population.csv')
    state_areas = pd.read_csv('./step3/state-areas.csv')
    abbreviations = pd.read_csv('./step3/state-abbrevs.csv')

    # Attach full state names; the abbreviation column duplicates
    # 'state/region' after the join, so it is dropped.
    named = population.merge(
        abbreviations,
        how='outer',
        left_on='state/region',
        right_on='abbreviation',
    ).drop(columns='abbreviation')

    # These two codes get their full names filled in by hand.
    for code, full_name in (('PR', 'Puerto Rico'), ('USA', 'United States')):
        named.loc[named['state/region'] == code, 'state'] = full_name

    # Attach the area of each state, then discard incomplete rows.
    result = named.merge(state_areas, on='state', how='left').dropna()

    # Select the two 2010 population groups.
    in_2010 = result['year'] == 2010
    totals_2010 = result[in_2010 & (result['ages'] == 'total')]
    minors_2010 = result[in_2010 & (result['ages'] == 'under18')]

    # Add the two populations element by element.
    # NOTE(review): the addition is positional, so it assumes both selections
    # list the same states in the same order - confirm against the input data.
    combined_population = (
        np.array(totals_2010.loc[:, 'population'])
        + np.array(minors_2010.loc[:, 'population'])
    )
    data2010 = totals_2010.copy()
    data2010.loc[:, 'population'] = combined_population

    # Index by state name so the printed series is labelled by state.
    data2010 = data2010.set_index('state')

    # Population density, highest first.
    density = (data2010['population'] / data2010['area (sq. mi)']).sort_values(ascending=False)

    # Print the first five and the last five entries.
    print('前5名:')
    print(density.iloc[:5])
    print('后5名:')
    print(density.iloc[-5:])
    #********** End **********#

文章来源:https://blog.csdn.net/m0_53208849/article/details/134913570
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。