Life is short, you need Python.
想搞点妹妹图片,一张一张的下载太麻烦…于是决定爬上一爬
目标网站:
分析目标网站,调节到移动模式,进行抓包分析,不难发现所有图片均以json字符串从服务端返回,这里怎么分析就不多介绍
分析单个request url:
反正就是很长的一串 一大堆参数 反正不知道是干嘛的 鼠标一直下滑 发现出现多个类似的请求url
比较多个 request url:
这里复制出来一部分:
Request URL:
https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&word=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&cg=star&pn=210&rn=30&gsm=d2&1599125756815=
Request URL:
https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&word=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&cg=star&pn=240&rn=30&gsm=f0&1599125756944=
Request URL:
https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&word=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&cg=star&pn=270&rn=30&gsm=10e&1599126065176=
不难发现:
pn 参数呈现规律性的递增,有可能是每页显示的条数,最后面的参数像加密字符串暂且不管
咱们先抓取其中一个URL看看是什么结果:
import requests
import json
header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36','referer': 'https://i.csdn.net/'}
def parse_url(url):
response = requests.get(url,headers=header)
return json.loads(response.content.decode())
print(parse_url("https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=©right=&word=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&cg=star&pn=30&rn=30"))
得到结果:
{
"queryEnc": "%D5%C5%D7%D3%B7%E3%CD%BC%C6%AC",
"queryExt": "张子枫图片",
"listNum": 30,
"displayNum": 21792,
"gsm": "3c",
"bdFmtDispNum": "约21,700",
"bdSearchTime": "",
"isNeedAsyncRequest": 0,
"bdIsClustered": "1",
"data": [
{
"adType": "0",
"hasAspData": "0",
"thumbURL": "https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=3618941107,2176593943&fm=26&gp=0.jpg",
"middleURL": "https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=3618941107,2176593943&fm=26&gp=0.jpg",
"largeTnImageUrl": "",
"hasLarge": 0,
"hoverURL": "https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=3618941107,2176593943&fm=26&gp=0.jpg",
"pageNum": 30,
"objURL": "ippr_z2C$qAzdH3FAzdH3Fvn_z&e3Biwtkw5_z&e3BvgAzdH3Ft42AzdH3Fa_a_8aa_aAzdH3F8c9ndbld9d_z&e3B9anAzdH3Fba0nl8lj0jv8m8kk99abjc10cbuvv0jj_z&e3B3r2",
"fromURL": "ippr_z2C$qAzdH3FAzdH3Frtv_z&e3Biwtkw5_z&e3Bv54AzdH3Ft4w2jAzdH3F8cdcblcm_z&e3Bip4s?ho=%Ec%BC%Aa%Ec%AD%la%Em%lE%AB&fhtr=nd",
"fromURLHost": "pic.haibao.com",
"currentIndex": "",
"width": 1024,
"height": 961,
"type": "jpg",
"is_gif": 0,
"isCopyright": 0,
"strategyAssessment": "1249909234_30_0_0",
"filesize": "",
"bdSrcType": "0",
"di": "27610",
"pi": "0",
"is": "0,0",
"imgCollectionWord": "",
"replaceUrl": [
{
"ObjURL": "http://img2.imgtn.bdimg.com/it/u=3618941107,2176593943&fm=214&gp=0.jpg",
"ObjUrl": "http://img2.imgtn.bdimg.com/it/u=3618941107,2176593943&fm=214&gp=0.jpg",
"FromURL": "http://fashion.ifeng.com/c/7ushnwc2kg2",
"FromUrl": "http://fashion.ifeng.com/c/7ushnwc2kg2"
},
{
"ObjURL": "http://imgboys1.yohobuy.com/cmsimg01/2019/08/21/03/03/0188975a38b1fdbba46f4884c50910a6a6.jpeg",
"ObjUrl": "http://imgboys1.yohobuy.com/cmsimg01/2019/08/21/03/03/0188975a38b1fdbba46f4884c50910a6a6.jpeg",
"FromURL": "http://www.yohoboys.com/channel/detail/release/id/84361/app/",
"FromUrl": "http://www.yohoboys.com/channel/detail/release/id/84361/app/"
}
],
"hasThumbData": "0",
"bdSetImgNum": 0,
"partnerId": 0,
"spn": 0,
"bdImgnewsDate": "2020-01-19 01:43",
"fromPageTitle": "<strong>张子枫</strong>",
"fromPageTitleEnc": "张子枫",
"bdSourceName": "",
"bdFromPageTitlePrefix": "",
"isAspDianjing": 0,
"token": "",
"imgType": "",
"cs": "3618941107,2176593943",
"os": "840935132,1601297368",
"simid": "2994841692,3622017507",
"personalized": "0",
"simid_info": null,
"face_info": null,
"xiangshi_info": null,
"adPicId": "0",
"source_type": ""
},
......
}
只显示部分。到了这里从 json 数组中不难看出我们想要的东西。结合 "listNum": 30 可知 rn 才是每页返回的条数,而 pn 是递增的起始偏移量(每翻一页加 30)
这里发现原来百度图库的图片基本也都是爬别人的,就没有一张高清图片
咱们去掉url最后面的参数 同时改变pn参数的值 注意每次加30:
经过再次抓取发现也能返回成功,到这一步,就可以开始整个图片的抓取了
直接上代码了:
import json  # JSON encode/decode for the API responses
import os  # create the output directory before saving images
from urllib import request  # urlretrieve for downloading image files

import requests
class ZzfSpider:
def __init__(self):
self.templete_url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=©right=&word=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&cg=star&pn={}&rn=30"#构建基础URL
self.total = 30
self.i = 0
# 处理内容
def get_content_list(self, dict_str):
return dict_str['data'], dict_str['displayNum']
# 保存内容
def save_content_list(self, content_list,i):
j = 0
with open("zzf_image_spider.text", "a") as f:
for content in content_list:
if content:#内容不为空则写入文本 并保存图片
request.urlretrieve(content['middleURL'], "zzf/%s.jpg" % content['strategyAssessment']) #创建图片
f.write(json.dumps(content, ensure_ascii=False, indent=2)) #将字典类型转化成json字符串写入文本
f.write("\n")
j += 1
page = i*30+j
print('保存成功,合计%s张图片' % page)
# 发送请求获得数据
def parse_url(self, url):
headers = {
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36",
"referer": "https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%D5%C5%D7%D3%B7%E3%CD%BC%C6%AC&fr=ala&ala=1&alatpl=star&pos=0&hs=2&xthttps=111111",
}
response = requests.get(url, headers=headers)
json_str = response.content.decode()
dict_str = json.loads(json_str)#将数据转换成python字典类型
return dict_str
# 主方法
def run(self):
num = 30
total = self.total
i = self.i
while True:
# 1 获取基础url
url = self.templete_url.format(num)
# print(url)
# 2 发送请求 获取相应
dict_str = self.parse_url(url)
# 3 数据处理
content_list, total = self.get_content_list(dict_str)
if total == 0:
break
else:
# 4 数据保存
self.save_content_list(content_list,i)
i += 1
# 5 构造下一页url
num += 30
# print(content_list)
if __name__ == '__main__':
ZzfSpider().run()
这里有一点 由于url中参数的值是累加的,抓取过程中发现到了pn=1350的时候,抓取数据返回空值,所以得在程序中做一个判断
最后程序效果:
视频展示:
python爬虫张子枫图片
由于图片太多,本次只抓取了 1000 多张图片便终止了程序
最后附上张子枫图片合集百度网盘链接:
https://pan.baidu.com/s/1Jmd2BHCM3oY1DM2AqBZjuw 密码:hk0h