1.2 Financial Data Processing
This article is part of the series 知行合一:投资篇 (Knowing and Doing as One: Investing).
Completed so far:
1.1 Programming Basics
1.1.1 Investing - Programming Basics - numpy
1.1.2 Investing - Programming Basics - pandas
1.2 Financial Data Processing (this article)
1. Getting Stock Market Data
1.1. Fetching stock data with qstock
For now, qstock is the best package I have found for pulling historical stock data, because:
tushare: most endpoints now require paid credits, so there is far less available for free
akshare: also looks usable; I cover it later in this article (section 1.4)
hand-rolled Python scraper: I wrote one before to pull historical data from Xueqiu; once the first version works it stays easy to use (the first pass is the hard part, mainly getting the login cookie right), and the source code is included below (section 1.3)
In short, qstock is the most convenient option I use at the moment. Thanks to its author!
In case the project ever disappears, I keep a local copy of its documentation: 《qstock基本使用》 (qstock basic usage).
import qstock as qs

# If qstock is not installed yet, run:
# pip install qstock
# pip install pywencai
df = qs.get_data('512000')  # full daily history for 512000; save it locally so it does not have to be re-downloaded every time, which is enough for simple algorithms and backtests
df.to_csv('512000.csv')
df
name code open high low close volume turnover turnover_rate
date
2016-09-14 券商ETF 512000 0.980 0.985 0.977 0.980 1177241 115431402.0 0.46
2016-09-19 券商ETF 512000 0.979 0.985 0.979 0.982 234915 23072514.0 0.09
2016-09-20 券商ETF 512000 0.982 0.983 0.978 0.980 172012 16855033.0 0.07
2016-09-21 券商ETF 512000 0.979 0.994 0.979 0.989 520398 51417210.0 0.20
2016-09-22 券商ETF 512000 0.998 0.998 0.988 0.989 264471 26239881.0 0.10
... ... ... ... ... ... ... ... ... ...
2023-12-22 券商ETF 512000 0.854 0.860 0.848 0.855 5963605 509122717.0 2.34
2023-12-25 券商ETF 512000 0.850 0.853 0.845 0.847 5823702 493326830.0 2.29
2023-12-26 券商ETF 512000 0.847 0.848 0.836 0.838 5493080 461334707.0 2.16
2023-12-27 券商ETF 512000 0.838 0.851 0.830 0.844 6957843 584933791.0 2.73
2023-12-28 券商ETF 512000 0.841 0.868 0.840 0.863 7554094 647459067.0 2.97
1771 rows × 9 columns
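Once the CSV is saved, later scripts can start from the local file instead of hitting the network again. A minimal sketch of reloading it with the dates restored as the index (the file name matches the save above):

import pandas as pd

# Reload the cached daily history; parse the 'date' column back into a DatetimeIndex
df = pd.read_csv('512000.csv', index_col='date', parse_dates=True)
print(df.tail())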
1.2. tushare
tushare official site: https://www.tushare.pro/
Take the daily quotes (日线行情) endpoint as an example.
Usage note: with 120 points you can call it at most 500 times per minute, 6000 rows per call, which is enough for roughly 23 years of daily history in a single request.
As luck would have it, I still have exactly 120 points, just enough to use this endpoint.
import tushare as ts

ts.set_token('the token you get after registering')
pro = ts.pro_api()
df = pro.daily(ts_code='000001.SZ', start_date='20230101', end_date='20231228')
# Save the daily quotes to CSV
df.to_csv('000001.csv')
df
ts_code trade_date open high low close pre_close change pct_chg vol amount
0 000001.SZ 20231228 9.11 9.47 9.08 9.45 9.12 0.33 3.6184 1661591.84 1550256.591
1 000001.SZ 20231227 9.10 9.13 9.02 9.12 9.10 0.02 0.2198 641534.35 582036.661
2 000001.SZ 20231226 9.19 9.20 9.07 9.10 9.19 -0.09 -0.9793 541896.33 493746.623
3 000001.SZ 20231225 9.18 9.20 9.14 9.19 9.20 -0.01 -0.1087 413970.88 379638.234
4 000001.SZ 20231222 9.19 9.28 9.11 9.20 9.17 0.03 0.3272 1005645.02 924998.769
... ... ... ... ... ... ... ... ... ... ... ...
236 000001.SZ 20230109 14.75 14.88 14.52 14.80 14.62 0.18 1.2312 1057659.11 1561368.487
237 000001.SZ 20230106 14.50 14.72 14.48 14.62 14.48 0.14 0.9669 1195744.71 1747915.169
238 000001.SZ 20230105 14.40 14.74 14.37 14.48 14.32 0.16 1.1173 1665425.18 2417272.356
239 000001.SZ 20230104 13.71 14.42 13.63 14.32 13.77 0.55 3.9942 2189682.53 3110729.449
240 000001.SZ 20230103 13.20 13.85 13.05 13.77 13.16 0.61 4.6353 2194127.94 2971546.989
241 rows × 11 columns
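A single call already covers a full year here, but when pulling longer spans or many symbols the 500-calls-per-minute cap starts to matter. A small sketch of looping over a few symbols with a pause between calls (the symbol list and the 0.2-second pause are illustrative, not prescriptive):

import time
import tushare as ts

ts.set_token('the token you get after registering')
pro = ts.pro_api()

symbols = ['000001.SZ', '600036.SH']  # illustrative list; replace with your own
for code in symbols:
    df = pro.daily(ts_code=code, start_date='20230101', end_date='20231228')
    df.to_csv('{}.csv'.format(code))
    time.sleep(0.2)  # 500 calls per minute allowed, so ~0.12 s between calls is the floor; 0.2 s keeps a margin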
1.3. Python scraper (Xueqiu)
Since qstock came along this approach is no longer as attractive, but I am keeping it here in case it comes in handy again.
Note: in the code you need to replace the value of the cookies_str variable with the cookie string from your own logged-in Xueqiu session.
import json
import time
import requests
import pandas as pd


def date_range(start, end, freq):
    """
    Build a range of dates using pandas.
    :param start: e.g. '2012-05-08'
    :param end: e.g. '2023-09-01'
    :param freq: e.g. '38D'
    :return: DatetimeIndex such as ['2012-05-28', '2012-07-05', '2012-08-12', ..., '2023-08-23']
    """
    # e.g. pd.date_range(start='2022-01-01', end='2022-06-17', freq='38D')
    date_range = pd.date_range(start=start, end=end, freq=freq)
    return date_range
class SecurityHistory:
    """
    Note that cookies_str can expire; if it does, grab a fresh one from the browser
    (mobile emulation mode) again.
    """

    def __init__(self, symbol) -> None:
        super().__init__()
        self.symbol = symbol
        self.fetch_days = {}
        self.data_of_days = []

    def do_request(self, begin_unix_time_str):
        data = {
            "symbol": self.symbol,
            "begin": begin_unix_time_str,
            "period": "day",
            "type": "before",
            "count": "38",
            "indicator": "kline,pe,pb,ps,pcf,market_capital,agt,ggt,balance"
        }
        # Request headers: pretend to be a mobile browser
        headers = {
            'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
        }
        # The raw cookie string; splitting it below saves assembling the key-value pairs by hand
        cookies_str = 'Your own cookie string: open Xueqiu with F12, look at the request cookies and copy the whole thing here.'
        cookies_dir = {cookie.split('=')[0]: cookie.split('=')[-1] for cookie in cookies_str.split('; ')}
        # Send the request
        index_result = requests.get('https://stock.xueqiu.com/v5/stock/chart/kline.json',
                                    params=data,
                                    cookies=cookies_dir,
                                    headers=headers)
        # Print the raw response
        print('Response received\n' + index_result.text)
        return index_result.text
    def result_save(self, result_text):
        # Parse the response string into a JSON object
        result_json = json.loads(result_text)
        last_record = {}
        for item in result_json['data']['item']:
            record = {}
            # Format the millisecond timestamp as a date string
            date = time.strftime("%Y-%m-%d", time.localtime(item[0] / 1000))
            # Build a uuid ourselves so we can tell whether this day's data already exists; only new days are saved
            record['uuid'] = result_json['data']['symbol'] + '|' + date
            record['date'] = date
            record['volume'] = item[1]
            record['open'] = item[2]
            record['high'] = item[3]
            record['low'] = item[4]
            record['close'] = item[5]
            record['chg'] = item[6]
            record['percent'] = item[7]
            record['turnoverrate'] = item[8]
            record['amount'] = item[9]
            print(record)
            if record['uuid'] in self.fetch_days:
                print('Data for this day is already stored')
            else:
                # fetch_days tracks which days have been fetched; data_of_days holds the daily records
                self.fetch_days[record['uuid']] = record['uuid']
                self.data_of_days.append(record)
            last_record = record
        return last_record
if __name__ == '__main__':
    security = 'SH510300'
    start_date = '2012-05-28'
    end_date = time.strftime("%Y-%m-%d", time.localtime())
    # Build the list of request dates
    datetime_range = date_range(start=start_date, end=end_date, freq='38D')
    this_class = SecurityHistory(security)
    for item in datetime_range:
        date_unix_time = str(int(time.mktime(item.timetuple()) * 1000))  # item is a Timestamp, e.g. 2012-05-28 00:00:00
        # Request one batch of data
        result_text = this_class.do_request(date_unix_time)  # e.g. date_unix_time='1338134400000'
        # Keep the daily records in memory, deduplicated by uuid
        this_class.result_save(result_text)
        # Sleep so the requests are not frequent enough to get blocked
        time.sleep(3)
    # Dump everything to CSV
    df = pd.DataFrame(this_class.data_of_days)
    # The default integer index is written into the CSV as well
    df.to_csv("{}-{}-{}.csv".format(security, start_date, end_date))
1.4. akshare
akshare's documentation is quite comprehensive, it works well, and on top of that it is free.
Note: stock data and public fund (ETF) data are served by different endpoints, which is easy to trip over.
Official site: https://www.akshare.xyz/index.html
Installing akshare: https://www.akshare.xyz/installation.html
A-share historical quotes: https://www.akshare.xyz/data/stock/stock.html#id20
ETF historical quotes (Eastmoney): https://www.akshare.xyz/data/fund/fund_public.html#id8
# pip install akshare --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple
import akshare as ak
stock_zh_a_hist_df = ak.stock_zh_a_hist(symbol="600036", period="daily")
stock_zh_a_hist_df
日期 开盘 收盘 最高 最低 成交量 成交额 振幅 \
0 2002-04-09 10.51 10.66 10.88 10.51 4141088 4.418822e+09 5.07
1 2002-04-10 10.66 10.60 10.70 10.39 679455 7.166843e+08 2.91
2 2002-04-11 10.60 10.52 10.68 10.49 227883 2.409635e+08 1.79
3 2002-04-12 10.50 10.57 10.64 10.48 212565 2.240599e+08 1.52
4 2002-04-15 10.57 10.39 10.60 10.35 185311 1.933069e+08 2.37
... ... ... ... ... ... ... ... ...
5200 2023-12-22 27.78 27.61 27.85 27.41 623644 1.721377e+09 1.59
5201 2023-12-25 27.60 27.53 27.72 27.42 419034 1.154001e+09 1.09
5202 2023-12-26 27.49 27.27 27.49 27.16 365257 9.971776e+08 1.20
5203 2023-12-27 27.33 27.30 27.37 27.05 410637 1.117439e+09 1.17
5204 2023-12-28 27.25 27.99 28.13 27.25 1061063 2.953201e+09 3.22
涨跌幅 涨跌额 换手率
0 46.03 3.36 69.02
1 -0.56 -0.06 11.32
2 -0.75 -0.08 3.80
3 0.48 0.05 3.54
4 -1.70 -0.18 3.09
... ... ... ...
5200 -0.36 -0.10 0.30
5201 -0.29 -0.08 0.20
5202 -0.94 -0.26 0.18
5203 0.11 0.03 0.20
5204 2.53 0.69 0.51
[5205 rows x 11 columns]
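By default this returns unadjusted prices. If forward-adjusted (前复权) prices are wanted, the same call takes an adjust parameter per the akshare docs; a small sketch:

import akshare as ak

# adjust="qfq" requests forward-adjusted prices; "hfq" is backward-adjusted; "" (the default) is unadjusted
stock_zh_a_hist_qfq_df = ak.stock_zh_a_hist(symbol="600036", period="daily", adjust="qfq")
print(stock_zh_a_hist_qfq_df.tail())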
For ETF data, the relevant documentation is here: https://www.akshare.xyz/data/fund/fund_public.html#id8
import akshare as ak
fund_etf_hist_em_df = ak.fund_etf_hist_em(symbol="512000", period="daily")
print(fund_etf_hist_em_df)
日期 开盘 收盘 最高 最低 成交量 成交额 振幅 \
0 2016-09-14 0.980 0.980 0.985 0.977 1177241 115431402.0 0.00
1 2016-09-19 0.979 0.982 0.985 0.979 234915 23072514.0 0.61
2 2016-09-20 0.982 0.980 0.983 0.978 172012 16855033.0 0.51
3 2016-09-21 0.979 0.989 0.994 0.979 520398 51417210.0 1.53
4 2016-09-22 0.998 0.989 0.998 0.988 264471 26239881.0 1.01
... ... ... ... ... ... ... ... ...
1766 2023-12-22 0.854 0.855 0.860 0.848 5963605 509122717.0 1.41
1767 2023-12-25 0.850 0.847 0.853 0.845 5823702 493326830.0 0.94
1768 2023-12-26 0.847 0.838 0.848 0.836 5493080 461334707.0 1.42
1769 2023-12-27 0.838 0.844 0.851 0.830 6957843 584933791.0 2.51
1770 2023-12-28 0.841 0.863 0.868 0.840 7554094 647459067.0 3.32
涨跌幅 涨跌额 换手率
0 0.00 0.000 0.46
1 0.20 0.002 0.09
2 -0.20 -0.002 0.07
3 0.92 0.009 0.20
4 0.00 0.000 0.10
... ... ... ...
1766 0.12 0.001 2.34
1767 -0.94 -0.008 2.29
1768 -1.06 -0.009 2.16
1769 0.72 0.006 2.73
1770 2.25 0.019 2.97
[1771 rows x 11 columns]
2. Data Cleaning and Calculation
2.1. Combining histories into one DataFrame
import qstock as qs
import pandas as pd

# Defaults: daily frequency, forward-adjusted, full history
# open/high/low/close: prices; volume: shares traded; turnover: traded amount; turnover_rate: turnover rate
# CSI 300, CSI 500, pharma ETF, broker ETF, new-energy ETF, dividend ETF, gold ETF, real-estate ETF
stocks_info = [
    {'code': '510300', 'name': '沪深300'},
    {'code': '510500', 'name': '中证500'},
    {'code': '512010', 'name': '医药ETF'},
    {'code': '512000', 'name': '券商ETF'},
    {'code': '516160', 'name': '新能源ETF'},
    {'code': '510800', 'name': '红利ETF'},
    {'code': '518880', 'name': '黄金ETF'},
    {'code': '512200', 'name': '房地产ETF'}
]

for stock in stocks_info:
    df = qs.get_data(stock['code'])  # fetch the full daily history from qstock
    stock['history_df'] = df         # keep it under the history_df key

# Keep only the closing price and merge everything into one frame
df_all = pd.DataFrame()
for stock in stocks_info:
    df = stock['history_df']
    df = df[['close']].copy()  # only the date index and the close column are needed
    df.rename(columns={'close': stock['name']}, inplace=True)  # rename close to the instrument's name
    if df_all.size == 0:
        df_all = df
    else:
        df_all = df_all.join(df)  # join aligns on the index (the date)
print(df_all)
沪深300 中证500 医药ETF 券商ETF 新能源ETF 红利ETF 黄金ETF 房地产ETF
date
2012-05-28 2.004 NaN NaN NaN NaN NaN NaN NaN
2012-05-29 2.044 NaN NaN NaN NaN NaN NaN NaN
2012-05-30 2.036 NaN NaN NaN NaN NaN NaN NaN
2012-05-31 2.030 NaN NaN NaN NaN NaN NaN NaN
2012-06-01 2.030 NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ...
2023-12-25 3.415 5.403 0.404 0.847 0.615 1.016 4.634 0.522
2023-12-26 3.395 5.356 0.401 0.838 0.613 1.012 4.647 0.513
2023-12-27 3.405 5.372 0.403 0.844 0.608 1.013 4.660 0.514
2023-12-28 3.489 5.475 0.410 0.863 0.650 1.035 4.662 0.527
2023-12-29 3.489 5.496 0.410 0.861 0.642 1.034 4.642 0.526
[2822 rows x 8 columns]
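Because join aligns on the date index, every column is NaN before that fund's first trading day. When a calculation needs a period covered by all instruments, the simplest option is to drop the incomplete rows; a small sketch that continues from df_all above:

# Keep only dates on which every instrument already has a close price
df_common = df_all.dropna()
print(df_common.head())
print('common period starts on:', df_common.index.min())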
2.2. Year-by-year return calculation
import qstock as qs
import pandas as pd

# Defaults: daily frequency, forward-adjusted, full history
# open/high/low/close: prices; volume: shares traded; turnover: traded amount; turnover_rate: turnover rate
# CSI 300, CSI 500, pharma ETF, broker ETF, new-energy ETF, dividend ETF (gold and real-estate ETFs commented out)
stocks_info = [
    {'code': '510300', 'name': '沪深300'},
    {'code': '510500', 'name': '中证500'},
    {'code': '512010', 'name': '医药ETF'},
    {'code': '512000', 'name': '券商ETF'},
    {'code': '516160', 'name': '新能源ETF'},
    {'code': '510800', 'name': '红利ETF'},
    # {'code': '518880', 'name': '黄金ETF'},
    # {'code': '512200', 'name': '房地产ETF'}
]

for stock in stocks_info:
    df = qs.get_data(stock['code'])  # fetch the full daily history from qstock
    stock['history_df'] = df         # keep it under the history_df key

# Prepare the close-price matrix
df_all = pd.DataFrame()
for stock in stocks_info:
    df = stock['history_df']
    df = df[['close']].copy()  # only the date index and the close column are needed
    df.rename(columns={'close': stock['name']}, inplace=True)  # rename close to the instrument's name
    if df_all.size == 0:
        df_all = df
    else:
        df_all = df_all.join(df)  # join aligns on the index (the date)
print(df_all)

# Collapse the daily dates down to years
yearly_pct_change = df_all.pct_change().to_period('A')
print('Dates collapsed to years:', yearly_pct_change)  # the index still has one entry per trading day, but its value is now just the year, which makes the groupby below straightforward
# Group by year and compound the daily returns into a yearly return
y_ret = (yearly_pct_change.groupby(yearly_pct_change.index).apply(lambda x: ((1 + x).cumprod() - 1).iloc[-1])).round(4)
print('Yearly compounded returns:', y_ret)
沪深300 中证500 医药ETF 券商ETF 新能源ETF 红利ETF
date
2012-05-28 2.004 NaN NaN NaN NaN NaN
2012-05-29 2.044 NaN NaN NaN NaN NaN
2012-05-30 2.036 NaN NaN NaN NaN NaN
2012-05-31 2.030 NaN NaN NaN NaN NaN
2012-06-01 2.030 NaN NaN NaN NaN NaN
... ... ... ... ... ... ...
2023-12-25 3.415 5.403 0.404 0.847 0.615 1.016
2023-12-26 3.395 5.356 0.401 0.838 0.613 1.012
2023-12-27 3.405 5.372 0.403 0.844 0.608 1.013
2023-12-28 3.489 5.475 0.410 0.863 0.650 1.035
2023-12-29 3.498 5.518 0.413 0.864 0.648 1.036
[2822 rows x 6 columns]
Dates collapsed to years:
沪深300 中证500 医药ETF 券商ETF 新能源ETF 红利ETF
date
2012 NaN NaN NaN NaN NaN NaN
2012 0.019960 NaN NaN NaN NaN NaN
2012 -0.003914 NaN NaN NaN NaN NaN
2012 -0.002947 NaN NaN NaN NaN NaN
2012 0.000000 NaN NaN NaN NaN NaN
... ... ... ... ... ... ...
2023 0.002642 -0.001294 0.002481 -0.009357 0.004902 0.000985
2023 -0.005857 -0.008699 -0.007426 -0.010626 -0.003252 -0.003937
2023 0.002946 0.002987 0.004988 0.007160 -0.008157 0.000988
2023 0.024670 0.019173 0.017370 0.022512 0.069079 0.021718
2023 0.002580 0.007854 0.007317 0.001159 -0.003077 0.000966
[2822 rows x 6 columns]
Yearly compounded returns:
沪深300 中证500 医药ETF 券商ETF 新能源ETF 红利ETF
date
2012 -0.0235 NaN NaN NaN NaN NaN
2013 -0.0756 0.1043 -0.0042 NaN NaN NaN
2014 0.6888 0.3900 0.0335 NaN NaN NaN
2015 0.0782 0.4757 0.3441 NaN NaN NaN
2016 -0.1120 -0.1640 -0.0090 -0.0112 NaN NaN
2017 0.2643 0.0056 0.2523 -0.0671 NaN NaN
2018 -0.2626 -0.3245 -0.1942 -0.2434 NaN -0.2092
2019 0.4246 0.2818 0.4277 0.4488 NaN 0.4099
2020 0.3066 0.2345 0.5844 0.1776 NaN 0.2925
2021 -0.0408 0.1725 -0.1571 -0.0343 0.4019 -0.0620
2022 -0.2044 -0.1835 -0.2449 -0.2618 -0.2727 -0.1679
2023 -0.0971 -0.0627 -0.1360 0.0385 -0.3435 -0.0952
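The lambda in the groupby compounds each year's daily returns: yearly return = (1 + r1)(1 + r2)...(1 + rn) - 1. A tiny standalone check of that formula on made-up numbers:

import pandas as pd

# Three made-up daily returns: +1%, -2%, +3%
daily = pd.Series([0.01, -0.02, 0.03])
compounded = (1 + daily).cumprod() - 1
print(compounded.iloc[-1])         # 0.019494, same as the groupby lambda would produce for this "year"
print(1.01 * 0.98 * 1.03 - 1)      # the same number computed directly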
3. Data Storage
3.1. Demo data used in the examples
All CSV files used in this series are stored at https://gitee.com/kelvin11/public-resources; the data runs through 2023-12-26.
Code | Name | URL |
---|---|---|
159825 | 农业ETF | https://gitee.com/kelvin11/public-resources/blob/master/159825.csv |
164906 | 中概互联网LOF | https://gitee.com/kelvin11/public-resources/blob/master/164906.csv |
510300 | 沪深300ETF | https://gitee.com/kelvin11/public-resources/blob/master/510300.csv |
510500 | 中证500ETF | https://gitee.com/kelvin11/public-resources/blob/master/510500.csv |
512000 | 券商ETF | https://gitee.com/kelvin11/public-resources/blob/master/512000.csv |
512010 | 医药ETF | https://gitee.com/kelvin11/public-resources/blob/master/512010.csv |
513180 | 恒生科技指数ETF | https://gitee.com/kelvin11/public-resources/blob/master/513180.csv |
516160 | 新能源ETF | https://gitee.com/kelvin11/public-resources/blob/master/516160.csv |
600036 | 招商银行 | https://gitee.com/kelvin11/public-resources/blob/master/600036.csv |
510300 | 沪深300ETF (仅收盘价) | https://gitee.com/kelvin11/public-resources/blob/master/SH510300-close.csv |
510500 | 中证500ETF (仅收盘价) | https://gitee.com/kelvin11/public-resources/blob/master/SH510500-close.csv |
3.2. Database
For scraped data, MongoDB is usually the better fit for storage. You can write a small connection wrapper yourself, but most of the work is simply calling the pymongo API.
MongoDB beginner tutorial (runoob): https://www.runoob.com/mongodb/mongodb-tutorial.html
import pymongo
import time


class MongoUtil:
    def __init__(self) -> None:
        super().__init__()
        # Connection settings; 'my.tengxun' is the author's own host alias, replace with your own
        self.mongo_meta = {'host': 'my.tengxun', 'port': '27017', 'user_name': 'admin', 'password': '123456',
                           'db': 'my_db'}
        self.mongo_client = pymongo.MongoClient('mongodb://%s:%s@%s:%s/' % (
            self.mongo_meta['user_name'], self.mongo_meta['password'], self.mongo_meta['host'], self.mongo_meta['port']))
        # The connection authenticates against the default admin db; here we select the working database
        self.db = self.mongo_client[self.mongo_meta['db']]
    def save_or_update_by_uuid(self, mongodb_collection, data_dict):
        """
        Convention: data_dict must contain a uuid field, which is used to decide between insert and update.
        If a record has no natural uuid, just build one yourself so that each record stays unique.
        :param mongodb_collection: collection name
        :param data_dict: record to save
        :return:
        """
        uuid = data_dict['uuid']
        detail_db_record = self.db[mongodb_collection].find_one({"uuid": uuid})
        if detail_db_record is None:
            # Add m_add_time / m_update_time fields set to the current time
            data_dict['m_add_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            data_dict['m_update_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            ret = self.db[mongodb_collection].insert_one(data_dict)
            return ret
        else:
            detail_db_record.update(data_dict)
            detail_db_record['m_update_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            detail_db_record.pop('_id', None)  # leave the immutable _id out of the $set document
            new_values = {"$set": detail_db_record}
            ret = self.db[mongodb_collection].update_one({"uuid": uuid}, new_values)
            return ret
    def load_all_data(self, mongodb_collection):
        """
        Iterate over the result with: for x in all_data
        :param mongodb_collection:
        :return:
        """
        return self.db[mongodb_collection].find()  # .sort("date", 1)
if __name__ == '__main__':
    this = MongoUtil()
    # {family: {"husband":"文豪"}} {house: {"$ne": null}} {"house": 1}
    customer_dict = {"uuid": "234mjnlfjj", "name": "学英", "age": 18, "family": {"husband": "赵文豪"}}
    # Note: this second example dict overrides the first one above
    customer_dict = {"uuid": "234mjnlfjj22222", "name": "刘x坤", "salary": 2000000, "house": [{"address": "江苏"}, {"address": "上海"}]}
    this.save_or_update_by_uuid('test_storage', customer_dict)

    # all_data = this.db['SH510500'].find({"date": {"$gt": "2013-03-15"}}).sort("date", 1)
    # for x in all_data:
    #     print(x)

    # Read all records back
    # all_data = this.load_all_data('SH510500')
    # print('Done reading; iterate with: for x in all_data')
    # for x in all_data:
    #     print(x)

    # Insert or update
    # mydict = {"uuid": "11111111", "name": "RUNOOB", "alexa": "100002222", "url": "https://www.runoob.com"}
    # this.save_or_update_by_uuid('sites', mydict)
    # print('insert or update done, pass 1')
    # time.sleep(3)
    # mydict['name'] = 'Kelvin'
    # this.save_or_update_by_uuid('sites', mydict)
    # print('insert or update done, pass 2')
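To connect this with section 1.3: each daily record from the Xueqiu scraper already carries a uuid of the form symbol|date, so it can be upserted straight into MongoDB instead of a CSV. A rough sketch, assuming SecurityHistory and MongoUtil from above are in scope, and using 'SH510300' as an illustrative collection name:

mongo = MongoUtil()
history = SecurityHistory('SH510300')
# One batch, exactly as in section 1.3
result_text = history.do_request('1338134400000')
history.result_save(result_text)
for record in history.data_of_days:
    # The uuid makes the write idempotent: re-running only updates days that already exist
    mongo.save_or_update_by_uuid('SH510300', record)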