背景
苦于没有合适工具统一方便地查看自己关注的股票的盘前股价。
思路
找到一个可以获取盘前数据的接口,再用 Python 请求该接口获取数据并展示出来。
搜索
打开雪球的个股页面,比如TQQQ。再打开网络控制台,过滤 XHR 类型请求,发现了盘前数据接口 https://stock.xueqiu.com/v5/stock/quote.json?symbol=TQQQ&extend=detail
返回数据样本如下
{
"data": {
"market": {
"status_id": 2,
"region": "US",
"status": "盘前交易",
"time_zone": "America/New_York",
"time_zone_desc": null
},
"quote": {
"current_ext": 43.19,
"symbol": "TQQQ",
"code": "TQQQ",
"high52w": 118.8,
"percent_ext": -2.29,
"avg_price": 42.884,
"delayed": 0,
"type": 4,
"percent": 5.92,
"tick_size": 0.01,
"float_shares": null,
"amplitude": 8.55,
"current": 44.2,
"high": 44.3399,
"current_year_percent": -48.93,
"float_market_capital": null,
"issue_date": 1265644800000,
"low": 40.77,
"sub_type": "8193",
"timestamp_ext": 1585908480287,
"market_capital": 2574650000,
"dividend": 0.03127,
"dividend_yield": 0.07494,
"currency": "USD",
"lot_size": 1,
"chg_ext": -1.01,
"lock_set": 1,
"navps": null,
"timestamp": 1585857600585,
"pe_lyr": null,
"amount": 2506199497,
"chg": 2.47,
"eps": null,
"last_close": 41.73,
"volume": 58441365,
"volume_ratio": null,
"pb": null,
"turnover_rate": 100.33,
"low52w": 32.27,
"name": "纳指3X做多-ProShares",
"pe_ttm": null,
"exchange": "NASDAQ",
"contract_size": 100,
"variable_tick_size": "0.0001 1 0.01",
"time": 1585857600585,
"total_shares": 58250000,
"open": 41.2,
"status": 1
},
"others": {
"pankou_ratio": 33.33
},
"tags": [
{
"description": "融",
"value": 6
},
{
"description": "空",
"value": 7
}
]
},
"error_code": 0,
"error_description": ""
}
编写爬虫函数
目录结构如下
.
├── json_utils.py
├── xueqiu_quote.py
└── quote.json
0 directories, 3 files
xueqiu_quote.py
从浏览器复制出 cURL 格式的请求之后,通过 https://curl.trillworks.com 转换成 Python 代码,使用的是 requests 网络库。
flat1 是从 data.market 结构里抽取数据,flat2 是从 data.quote 里抽取数据。再把 get_quote 和 flat 两个函数以管道形式结合成新的函数 quote_simple,也就是爬取接口并返回只包含所需数据的简洁格式。这样就可以转换成 CSV 表格形式,方便在命令行查看了。
#!/usr/bin/env python3
import json, time
import os, sys
from functools import partial
import requests
from json_utils import *
# Xueqiu rejects anonymous requests; fail fast when the auth cookie is absent.
TOKEN = os.getenv('XQ_TOKEN')
if not TOKEN:
    # stdout is reserved for the JSON payload piped into csv_json,
    # so the diagnostic must go to stderr.
    print('missing XQ_TOKEN', file=sys.stderr)
    sys.exit(1)
def get_quote(symbol):
    """Fetch the raw quote JSON for *symbol* from Xueqiu's v5 quote endpoint.

    Returns the decoded response body as a dict (same shape as the sample
    in the article).  Raises requests.HTTPError for non-2xx responses and
    requests.Timeout if the server stalls, instead of hanging or failing
    later with a confusing JSON decode error.
    """
    cookies = {'xq_a_token': TOKEN}
    # Headers copied from a real browser session; Xueqiu blocks obviously
    # non-browser clients.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:75.0) Gecko/20100101 Firefox/75.0',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://xueqiu.com/',
        'Origin': 'https://xueqiu.com',
        'Connection': 'Close',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
    }
    # The trailing '_' parameter is a cache-buster so intermediaries don't
    # serve stale quotes.
    params = (('symbol', symbol), ('extend', 'detail'), ('_', time.time()))
    response = requests.get(
        'https://stock.xueqiu.com/v5/stock/quote.json',
        headers=headers,
        params=params,
        cookies=cookies,
        timeout=10,  # requests has no default timeout; never hang the pipeline
    )
    response.raise_for_status()  # surface HTTP errors early and clearly
    return response.json()
# (source_key, output_key) pairs selected from the sample data.quote payload.
# List order fixes the CSV column order downstream (Python 3.7+ dicts keep
# insertion order); the commented-out pairs document the remaining fields the
# endpoint returns, kept here for easy re-enabling.
keys_raw_map = [
    # ('current_ext', 'current_ext'),
    ('time', 'time'),
    ('exchange', 'exchange'),
    ('symbol', 'symbol'),
    ('name', 'name'),
    # ('code', 'code'),
    # ('high52w', 'high52w'),
    # ('avg_price', 'avg_price'),
    # ('delayed', 'delayed'),
    # ('type', 'type'),
    # ('tick_size', 'tick_size'),
    # ('float_shares', 'float_shares'),
    # ('amplitude', 'amplitude'),
    # session price range and previous close
    ('open', 'open'),
    ('low', 'low'),
    ('high', 'high'),
    ('last_close', 'last_close'),
    # today's regular-session quote
    ('current', 'current'),
    ('chg', 'chg'),
    ('percent', 'percent'),
    # extended-hours change ('ext' fields — presumably pre/post market; TODO confirm)
    ('chg_ext', 'chg_ext'),
    ('percent_ext', 'percent_ext'),
    # year-to-date performance
    ('current_year_percent', 'current_year_percent'),
    # ('float_market_capital', 'float_market_capital'),
    # ('issue_date', 'issue_date'),
    # ('sub_type', 'sub_type'),
    # ('timestamp_ext', 'timestamp_ext'),
    # ('market_capital', 'market_capital'),
    # ('dividend', 'dividend'),
    # ('dividend_yield', 'dividend_yield'),
    # ('currency', 'currency'),
    # ('lot_size', 'lot_size'),
    # ('lock_set', 'lock_set'),
    # ('navps', 'navps'),
    # ('timestamp', 'timestamp'),
    # ('pe_lyr', 'pe_lyr'),
    # ('amount', 'amount'),
    # ('eps', 'eps'),
    # ('volume', 'volume'),
    # ('volume_ratio', 'volume_ratio'),
    # ('pb', 'pb'),
    # ('turnover_rate', 'turnover_rate'),
    # ('low52w', 'low52w'),
    # ('pe_ttm', 'pe_ttm'),
    # ('contract_size', 'contract_size'),
    # ('variable_tick_size', 'variable_tick_size'),
    # ('total_shares', 'total_shares'),
    # ('status', 'status'),
]
# flat1 extracts data.market fields verbatim (no renaming needed).
keys, keysmap = ['region', 'time_zone', 'status'], {}
flat1 = partial(flat_response, keys=keys, keysmap=keysmap, root_path='data.market')
# flat2 extracts the data.quote fields/aliases declared in keys_raw_map.
# NOTE: 'keys'/'keysmap' are deliberately rebound here; partial() above has
# already captured the previous values.
keys, keysmap = parse_mapping(keys_raw_map)
flat2 = partial(flat_response, keys=keys, keysmap=keysmap, root_path='data.quote')
# flat merges both flatteners into one dict-producing function.
flat = merge_multi_flater([flat1, flat2])
# quote_simple: fetch -> flatten -> render the 'time' epoch field as a date string.
quote_simple = compose_funcs([get_quote, flat, partial(update_ts, keys=['time'])])
if __name__ == '__main__':
    import argparse

    # One or more ticker symbols on the command line, e.g. `tqqq qqq tsla`.
    cli = argparse.ArgumentParser()
    cli.add_argument("symbol", nargs='+')
    opts = cli.parse_args()

    # Fetch and flatten every requested symbol, then emit one JSON array.
    rows = list(map(quote_simple, opts.symbol))
    print(jdumps(rows))
keys_raw_map 这个结构是从样本响应结果里转换过来的,只包含所需要的字段,并且决定了后面 CSV 输出的表头顺序。这个顺序由 Python 3 的 dict 保持插入顺序的特性来保证。
json_utils.py
import json
import os
import sys
from functools import partial
# Public helper surface of this module; everything else is internal.
__all__ = [
    'flat_response',
    'compose_funcs',
    'parse_mapping',
    'merge_multi_flater',
    'jdumps',
    'update_ts',
]
# Any non-empty DEBUG env var enables per-step tracing in compose_funcs.
DEBUG = os.getenv('DEBUG')
def g(data, ks):
    """Fetch a nested value from *data* by dotted path or key sequence.

    *ks* may be a 'a.b.c' style string or an iterable of keys.  Returns
    None when any key along the path is missing or an intermediate value
    is not a dict.  Unlike the previous `data or None` version, valid
    falsy leaf values (0, '', [], {}) are returned as-is instead of
    being collapsed to None.
    """
    if isinstance(ks, str):
        ks = ks.split('.')
    for k in ks:
        if not isinstance(data, dict):
            # e.g. path descends into a list/scalar — treat as missing
            return None
        data = data.get(k)
        if data is None:
            return None
    return data
# json.dumps that keeps non-ASCII text (e.g. Chinese stock names) readable
# instead of \u-escaped.
jdumps = partial(json.dumps, ensure_ascii=False)
def compose_funcs(xs):
    """Compose the callables in *xs* into one left-to-right pipeline.

    The returned function feeds its argument through each callable in
    order and returns the final result.  With the DEBUG env var set,
    every intermediate value is traced.  All diagnostics go to stderr so
    they never pollute stdout, which callers pipe into csv tooling.
    On failure the offending step and its input are reported, then the
    original exception is re-raised.
    """
    def fn(data):
        for step in xs:
            try:
                data = step(data)
                if DEBUG:
                    print('->', jdumps(data), file=sys.stderr)
            except Exception:
                print(step, data, file=sys.stderr)
                raise
        return data
    return fn
def parse_mapping(xs):
    """Split a mixed list of keys and (key, alias) pairs.

    Each element of *xs* is either a bare key or a (key, alias)
    list/tuple.  Returns (ordered_keys, alias_map): keys in input order,
    and a dict mapping each paired key to its alias.
    """
    ordered = []
    aliases = {}
    for item in xs:
        if isinstance(item, (list, tuple)):
            aliases[item[0]] = item[1]
            ordered.append(item[0])
        else:
            ordered.append(item)
    return ordered, aliases
def flat_response(data, keys=(), keysmap=None, root_path=None):
    """Project *data* down to a flat dict of selected keys.

    When *root_path* is given (e.g. 'data.quote'), extraction starts at
    that nested sub-dict; otherwise *data* itself is used.  Each key in
    *keys* is emitted under its alias from *keysmap* (or under its own
    name).  Raises KeyError if a requested key is absent.

    Defaults are immutable/None instead of the previous `keys=[],
    keysmap={}` — shared mutable default arguments are a classic Python
    pitfall even when currently unmutated.
    """
    keysmap = keysmap or {}
    src = g(data, root_path) if root_path else data
    return {keysmap.get(k, k): src[k] for k in keys}
def merge_multi_flater(funcs: list):
    """Combine several flatteners into one.

    Each element of *funcs* maps dict -> dict.  The combined function
    applies every flattener to the same input and merges the resulting
    dicts; on key collisions, later flatteners win.
    """
    def combined(data):
        merged = {}
        for flatten in funcs:
            merged.update(flatten(data))
        return merged
    return combined
def ts2date(ts):
    """Convert a Unix timestamp in seconds OR milliseconds to a local datetime.

    Millisecond timestamps are detected by magnitude: anything above 1e11
    is treated as milliseconds (plain seconds stay below 1e11 until year
    ~5138).  The previous cutoff of 1580000000000 (2020-01-26 in ms)
    wrongly interpreted any millisecond timestamp earlier than that date
    as seconds, producing dates tens of thousands of years in the future.
    """
    from datetime import datetime
    if ts > 1e11:
        ts = ts / 1000
    return datetime.fromtimestamp(ts)
def update_ts(data, keys=(), fmt='%Y-%m-%d %X'):
    """Replace epoch-timestamp fields of *data* with formatted date strings.

    Mutates *data* in place for every key in *keys* (each value must be a
    Unix timestamp accepted by ts2date) and returns it so the function
    composes in a pipeline.  The default for *keys* is an immutable tuple
    instead of the mutable-default `[]` anti-pattern.
    """
    for k in keys:
        data[k] = ts2date(data[k]).strftime(fmt)
    return data
运行
python3 xueqiu_quote.py tqqq qqq lk v tsla erx | csv_json | csv /dev/stdin
其中 csv_json 命令把 JSON 格式输出的平面数据转换成 CSV 格式,csv 命令把 CSV 格式渲染成表格。
效果如下:
问题
如图所示,存在返回过期数据的情况。但相同接口在浏览器打开又是正常的。怀疑是 cookie 的原因,确认了一遍,cookie 也是一样的。到这里就百思不得其解了。