2020年4月29日 10:10 by wst
数据抓取最近一直在研究广点通数据的抓取,终于搞定。
当用户名、密码以及验证错误不能通过的时候,只能使用cookie(作为凭证)来抓取数据。
那么就需要考虑以下几个问题:
1. 哪些cookie是必需的?
2. 是否可以禁用cookie?
3. 这些必需的cookie怎样获取?
1. 从浏览器复制所有cookie之后,把它们粘贴到代码里,用程序一个一个试:先去掉cookie中的一个key,看能不能正确获取结果,然后再去掉一个,直至不能正确获取结果。
2. 一般情况是不能禁用cookie的,否则服务端根本不知道你是谁,肯定不会给你正确的响应。
3. 通过chrome的开发工具(F12)可以获取所有请求记录。详细过程如下:
(1)打开调试模式(F12),然后正常操作,直至想要的页面/数据出现;
(2)找到目标请求,看它用了哪些cookie,比如abc;
(3)然后在网络监控窗口中搜索这个cookie的来源,搜索方法:set-cookie-name:abc;
(4)模拟这个请求,获取这个cookie。
(5)把获取到的cookie添加到目标请求中
获取依赖的cookie:
#!/usr/bin/env python
"""
FileName: qq_adnet_sso
Author: deepinwst
Email: movingheart000@gmail.com
Date: 2020/4/28 19:04:40
"""
import requests
from urllib.parse import urljoin
from utils import read_cookie
# Browser-like request headers sent on every hop of the SSO redirect chain.
# Only the ``authority`` entry differs between hops, so it is added per call.
_SSO_COMMON_HEADERS = {
    'pragma': 'no-cache',
    'cache-control': 'no-cache',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'same-site',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'referer': 'https://e.qq.com/dev/index.html',
    'accept-language': 'zh-CN,zh;q=0.9',
}


def _sso_headers(authority):
    """Return a fresh header dict for *authority* (``authority`` key first)."""
    headers = {'authority': authority}
    headers.update(_SSO_COMMON_HEADERS)
    return headers


def get_adnet_sso(qq_num, timeout=10):
    """Follow the e.qq.com SSO redirect chain by hand and return ``adnet_sso``.

    Redirects are not followed automatically (``allow_redirects=False``) so
    that each hop can be sent with its own headers and cookies, and so the
    ``Set-Cookie`` of the last hop can be read from its response.

    Args:
        qq_num: Account identifier passed to ``read_cookie``.
            NOTE(review): ``read_cookie`` is defined in ``utils``; it
            presumably returns the stored cookie mapping for that account --
            confirm against that module.
        timeout: Timeout in seconds forwarded to every ``requests.get`` call.
            Without it a stalled connection would block indefinitely.

    Returns:
        The value of the ``adnet_sso`` cookie set by the final hop, or
        ``None`` when the cookie is absent or when any hop does not answer
        with a ``Location`` header (the chain cannot be continued).
    """
    login = "https://e.qq.com/dev/login"

    # Hop 0: the login entry point is expected to redirect to the SSO host,
    # e.g. Location: https://sso.e.qq.com/login?service_tag=14&sso_redirect_uri=...
    res_login = requests.get(login, allow_redirects=False, timeout=timeout)
    cookie = read_cookie(qq_num)
    print("res_login.headers:", res_login.headers)
    location = res_login.headers.get('Location')
    if not location:
        return None
    # urljoin leaves absolute URLs untouched and resolves relative ones.
    url1 = urljoin(login, location)

    # Hop 1: SSO login, sent with the stored account cookies. The observed
    # answer is a relative Location such as /login/hub?service_tag=14&...
    res_login1 = requests.get(
        url1,
        allow_redirects=False,
        headers=_sso_headers('sso.e.qq.com'),
        cookies=cookie,
        timeout=timeout,
    )
    print("res_login1.headers:", res_login1.headers)
    location = res_login1.headers.get('Location')
    if not location:
        return None
    url2 = urljoin(url1, location)

    # Hop 2: SSO hub, again with the stored cookies. The observed answer
    # redirects back to https://e.qq.com/dev/redirect?sso_ticket=ST-...
    res_login2 = requests.get(
        url2,
        allow_redirects=False,
        headers=_sso_headers('sso.e.qq.com'),
        cookies=cookie,
        timeout=timeout,
    )
    print("res_login2.headers:", res_login2.headers)
    location = res_login2.headers.get('Location')
    if not location:
        return None
    url3 = urljoin(url2, location)

    # Hop 3: redeem the ticket on e.qq.com. No cookies are sent here; the
    # observed response carries Set-Cookie entries for adnet_sso_flag and
    # adnet_sso.
    res_login3 = requests.get(
        url3,
        allow_redirects=False,
        headers=_sso_headers('e.qq.com'),
        timeout=timeout,
    )
    print("res_login3.headers:", res_login3.headers)
    return res_login3.cookies.get("adnet_sso", None)
if __name__ == "__main__":
    # Manual smoke test: fetch the SSO cookie for one account and show it.
    sso_cookie = get_adnet_sso("255****391")
    print(sso_cookie)
使用cookie模拟最终请求:
import os
import requests
import pandas as pd
from io import StringIO
import datetime
from copy import deepcopy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from config import qq_paras
from utils import find_file
from utils import read_cookie, update_cookie, write_cookie
from qq_adnet_sso import get_adnet_sso
def qq_ad_data(qq_num="255****391", timeout=30):
    """Download yesterday's ad report for one account and parse it as CSV.

    The request template (URL, form fields, headers) and the per-account
    settings come from ``qq_paras``. A fresh ``adnet_sso`` cookie is obtained
    via ``get_adnet_sso`` and added to the account's cookies before posting.

    Args:
        qq_num: Key into ``qq_paras['number']`` selecting the account; also
            passed to ``read_cookie`` and ``get_adnet_sso``.
        timeout: Timeout in seconds for the report request, so a stalled
            connection cannot block indefinitely.

    Returns:
        A ``pandas.DataFrame`` parsed from the response body on success, or
        the string ``"error"`` when the SSO cookie could not be obtained, the
        HTTP status is not 200, or the body lacks the expected marker text.
    """
    url = qq_paras['url']
    # Copy the templates so per-call edits never leak into shared config.
    form_data = deepcopy(qq_paras['form_data'])
    headers = deepcopy(qq_paras['headers'])

    # The report covers a single day: yesterday, by the local clock.
    yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
    date_str = yesterday.strftime("%Y-%m-%d")

    # Only used for the diagnostic print below.
    ori_cookies = read_cookie(qq_num)
    print("type(ori_cookies):{}, ori_cookies:{}".format(type(ori_cookies), ori_cookies))

    account = qq_paras['number'][qq_num]
    form_data['startDate'] = date_str
    form_data['endDate'] = date_str
    form_data['medium'] = account['medium']
    form_data['placement'] = account['placement']
    form_data['memberId'] = account['memberId']

    # Cookies are passed through ``cookies=``; drop any header-level copy.
    headers.pop('cookie', None)

    # Copy the account cookies as well: adding adnet_sso below must not
    # mutate the shared qq_paras structure.
    my_cookie = deepcopy(account['cookie'])
    adnet_sso = get_adnet_sso(qq_num)
    if not adnet_sso:
        print("获取cookie失败:{}".format(adnet_sso))
        return "error"
    my_cookie['adnet_sso'] = adnet_sso

    res = requests.post(
        url=url,
        headers=headers,
        data=form_data,
        cookies=my_cookie,
        timeout=timeout,
    )

    # Abnormal HTTP status.
    if res.status_code != 200:
        print("error:{}".format(res.status_code))
        return "error"

    # Anything without the marker text is treated as an unhandled response.
    # NOTE(review): '时间' is presumably a column header of the CSV -- confirm.
    if '时间' not in res.text:
        print("Nothing handle:{}".format(res.text))
        return "error"

    # Expected payload: echo it, then parse the body as CSV.
    print("res.text:\n{}".format(res.text))
    fp = StringIO(res.text)
    df = pd.read_csv(fp)
    return df
if __name__ == "__main__":
    # Manual run: print a timestamp, then the report (or "error") for one account.
    started_at = datetime.datetime.now().strftime("%Y-%m-%d %X")
    print(started_at)
    report = qq_ad_data(qq_num='168****825')
    print(report)
如需要完整代码,请联系我。