# 爬虫百度百万高清美图源代码
# -*- coding: utf-8 -*-
# Date: 2020-09-20 16:52
# USER: gordon_lu
# 使用正则表达式筛选指定的 URL 链接。
"""
oo0oo
o8888888o
88" . “88
(| -- |)
0\ = /0
/ ‘—’ _
.’ | |/ ‘.
/ \||| : |||//
/ ||||| -卍- |||||
| | \\ - /// | |
| _| ‘’\ — /’’ | |
\ .-__ ‘-’ /-. /
, . ’ /–.--\ ’ ’
. "" ̄ ̄ <’ '. _<|>/.’ '> ̄ ̄ “” .
| | : ‘- \ .; ’ \ _ /’ ;, / - ’ : | |
\ \ '. _ __ \ / / . _.’ / /
===== '-.___ ‘.___ ___/.-’ _____.-’ =====
‘=—=’
"""
import os
import random
import re
from urllib.parse import quote, urlsplit

import requests
# 1. Build the search URL.
# The keyword is read interactively and percent-encoded before it is placed in
# the query string, so characters such as spaces, "&", "#" or "=" cannot
# truncate or corrupt the other parameters.
word = input("请输入你要爬虫的内容【暂时仅支持英文和字母】:")
url = (
    "https://image.baidu.com/search/index"
    "?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=&st=-1&fm=result&fr=&sf=1"
    # "&copyright=" had been collapsed into "©right=" by HTML-entity decoding.
    "&fmq=1600592048477_R&pv=&ic=&nc=1&z=&hd=&latest=&copyright=&se=1"
    "&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&sid="
    f"&word={quote(word, safe='')}"
)
def random_user_agent():
    """Return a randomly chosen browser User-Agent string.

    Returns:
        str: One entry picked uniformly at random from the built-in pool.
    """
    # Each entry needs its own trailing comma: adjacent string literals are
    # silently concatenated, which previously fused two User-Agents into one
    # invalid value. The duplicate entry was dropped and the first entry was
    # normalised to the standard "Product/Version" token layout.
    ulist = [
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/88.0.4183.102 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    ]
    return random.choice(ulist)
_IMAGE_DIR = "./baidu_pic"
_IMAGE_SUFFIXES = (".jpg", ".png", ".jpeg", ".gif")
_REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely without one


def _image_filename(image_url):
    """Derive a safe local file name from an image URL."""
    # Use only the path component so query strings and fragments
    # ("a.jpg?x=1") never end up in the file name.
    name = os.path.basename(urlsplit(image_url).path)
    # Replace characters that are illegal in file names on common platforms.
    name = re.sub(r'[\\/:*?"<>|]', "_", name) or "image"
    # Case-insensitive suffix check; default to .jpg when none is recognised.
    if not name.lower().endswith(_IMAGE_SUFFIXES):
        name += ".jpg"
    return name


def get_image(url):
    """Download every image referenced by a Baidu image-search result page.

    Fetches ``url``, extracts each ``"objURL"`` value from the response text
    with a regular expression, and writes each image into ``./baidu_pic/``
    (created if missing). An image that fails to download is reported on
    stdout and skipped, so one bad link does not abort the whole run.

    Args:
        url: Address of the search result page. It is also sent as the
            ``referer`` header of the page request.

    Raises:
        requests.RequestException: If the search page itself cannot be
            fetched (connection error or timeout).
    """
    headers = {
        "user-agent": random_user_agent(),
        "referer": url,
    }
    result = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT).text
    # The full-size image addresses are embedded in the page as "objURL" fields.
    image_urls = re.findall(r'"objURL":"(.*?)"', result)

    # open() does not create missing parent directories.
    os.makedirs(_IMAGE_DIR, exist_ok=True)

    for image_url in image_urls:
        image_name = _image_filename(image_url)
        print(image_name)
        try:
            response = requests.get(image_url, timeout=_REQUEST_TIMEOUT)
            # Do not save an HTTP error page under an image file name.
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"skipped {image_url}: {exc}")
            continue
        with open(os.path.join(_IMAGE_DIR, image_name), "wb") as f:
            f.write(response.content)
# 2. Run the crawler: download every image found on the search page at `url`.
get_image(url)