上文我们直接通过 XPath 解析页面元素来获取信息,这种做法比较笨;这次换一种方式:通过分析请求规律找到返回 JSON 数据的接口,直接请求并解析 JSON 即可。
实现比较简单,代码如下:
爬取爱奇艺华语院线电影和美国院线电影
import json
import random
import urllib.request
import urllib.response
import pymysql
# Pool of browser User-Agent strings; one entry is picked at random each run.
useragent = [
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
]

# Request headers sent with every API call.
# NOTE(review): "method", "authority", "path" and "scheme" look like HTTP/2
# pseudo-headers copied from browser dev tools; here they are passed along as
# ordinary header fields -- confirm whether the API actually needs them.
headers1 = {
    # random.choice draws from the whole pool; the previous randint(0, 5)
    # could only ever reach the first six entries.
    'User-Agent': random.choice(useragent),
    "method": "GET",
    "authority": "list.iqiyi.com",
    "path": "/www/1/1-27815------------11-1-1-iqiyi--.html",
    "scheme": "https",
    # "*/*" restored: the wildcard media range had been garbled to "**".
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "max-age=0",
    "upgrade-insecure-requests": "1",
    "cookie": "QC005=b4b984320692d3b4314ab4e223adcd2d; QC173=0; Hm_lvt_53b7374a63c37483e5dd97d78d9bb36e=1585879290; QC175=%7B%22upd%22%3Atrue%2C%22ct%22%3A%22%22%7D; QC007=DIRECT; QC006=cxi755tl8psf5cdjn3675ae7; QC008=1585886016.1585886016.1585886016.1; nu=0; T00404=a8a63ad6d7b7e4dceba700e91b9dbbea; IMS=IggQABj_up30BSokCiAyMjMwMzVkODNlODEzNzZlMTk0ODg1N2ZkNzFkZTZjYRAAciQKIDIyMzAzNWQ4M2U4MTM3NmUxOTQ4ODU3ZmQ3MWRlNmNhEAA; QP001=1; QP0013=; QC010=33145914; Hm_lpvt_53b7374a63c37483e5dd97d78d9bb36e=1585888116; __dfp=a1c2f37ae6eb3d4d62ac566e461536f7169314d9d24b23dbb85ca81e8138c8394d@1587175290494@1585879291494",
}
# Item counters for the Chinese (Ci) and US (Ui) listings.
Ci = 1
Ui = 1
i = 1  # page number for the Chinese theatrical-release listing
j = 1  # page number for the US theatrical-release listing
# Open the database connection.
# Keyword arguments are used because PyMySQL 1.0+ rejects positional
# connection parameters; the mapping follows the historical positional order
# (host, user, password, database).
db = pymysql.connect(
    host="******",
    user="****",
    password="*******",
    database="*******",
    port=3306,
    charset='utf8',
)
# Disabled section: scraping of the Chinese theatrical-release listing.
# It is wrapped in a bare triple-quoted string, so none of it executes.
# NOTE(review): the disabled code ends with db.close(); if it is re-enabled
# together with the US section below, that call would close the connection
# before the US section uses it.
'''
# 华语院线
print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
print("华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语")
print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
ChineseUrl = "https://pcw-api.iqiyi.com/search/video/videolists?access_play_control_platform=14&channel_id=1&data_type=1&from=pcw_list&is_album_finished=&is_purchase=&key=&market_release_date_level=&mode=11&pageNum={}&pageSize=48&site=iqiyi&source_type=&three_category_id=1;must,27815;must&without_qipu=1" # 华语院线 pageNum控制分页 共19页
print("###############################第{}页####################################".format(i))
ChineseUrl = ChineseUrl.format(i)
chineseRequest = urllib.request.Request(url=ChineseUrl, headers=headers1)
ChineseReponse = urllib.request.urlopen(chineseRequest)
Chinesehtml = ChineseReponse.read().decode('utf-8')
Chinesejson = json.loads(Chinesehtml)
while Ci <= 47:
# 使用cursor()方法获取操作游标
cursor = db.cursor()
print("------------------------------------------------------------------------------------------------")
print(Ci)
print(Chinesejson['data']['list'][Ci]['name'])
print(Chinesejson['data']['list'][Ci]['playUrl'])
print(Chinesejson['data']['list'][Ci]['imageUrl'])
print(Chinesejson['data']['list'][Ci]['score'])
print(Chinesejson['data']['list'][Ci]['secondInfo'])
# SQL 插入语句
sql = """INSERT INTO iqiyiVideo(
name,score,videourl,imageurl,mainact,flag
)
VALUES ('{0}','{1}','{2}','{3}','{4}','{5}')""".format(Chinesejson['data']['list'][Ci]['name'],
Chinesejson['data']['list'][Ci]['score'],
Chinesejson['data']['list'][Ci]['playUrl'],
Chinesejson['data']['list'][Ci]['imageUrl'],
Chinesejson['data']['list'][Ci]['secondInfo'],
"chinese"
)
try:
cursor.execute(sql) # 执行sql语句
db.commit() # 提交到数据库执行
print("插入成功!")
except Exception as e:
db.rollback() # 如果发生错误则回滚
print("插入失败{}".format(e))
Ci = Ci + 1
db.close()
'''
# US theatrical releases: fetch one page of the iQiyi list API and insert
# every returned movie into the iqiyiVideo table, then close the connection.
print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
print("美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国")
print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
# US listing; pageNum selects the page (12 pages in total per the original note).
USAUrl = "https://pcw-api.iqiyi.com/search/video/videolists?access_play_control_platform=14&channel_id=1&data_type=1&from=pcw_list&is_album_finished=&is_purchase=&key=&market_release_date_level=&mode=11&pageNum={}&pageSize=48&site=iqiyi&source_type=&three_category_id=2;must,27815;must&without_qipu=1"
print("###############################第{}页####################################".format(j))
USAUrl = USAUrl.format(j)
# headers1 is the only header dict this script defines; the former reference
# to headers2 raised NameError before any request was made.
USARequest = urllib.request.Request(url=USAUrl, headers=headers1)

# Parameterized INSERT: the driver escapes the values, so quotes in a movie
# title can neither break the statement nor inject SQL.
sql = """INSERT INTO iqiyiVideo(
    name,score,videourl,imageurl,mainact,flag
)
VALUES (%s,%s,%s,%s,%s,%s)"""

try:
    with urllib.request.urlopen(USARequest) as USAReponse:
        USAhtml = USAReponse.read().decode('utf-8')
    USAjson = json.loads(USAhtml)

    # Walk whatever the page actually returned instead of assuming indices
    # 1..47: this includes the first item and tolerates short (last) pages.
    for Ui, movie in enumerate(USAjson['data']['list']):
        # .get() keeps one incomplete record from aborting the whole run;
        # a missing field is passed to the driver as None.
        name = movie.get('name')
        play_url = movie.get('playUrl')
        image_url = movie.get('imageUrl')
        score = movie.get('score')
        second_info = movie.get('secondInfo')

        print("------------------------------------------------------------------------------------------------")
        print(Ui)
        print(name)
        print(play_url)
        print(image_url)
        print(score)
        print(second_info)

        try:
            # The cursor is closed automatically when the with-block exits.
            with db.cursor() as cursor:
                cursor.execute(sql, (name, score, play_url, image_url, second_info, "USA"))
            db.commit()  # persist this row
        except Exception as e:
            db.rollback()  # undo the failed insert and continue with the next row
            # Report the failure instead of swallowing it silently.
            print("插入失败{}".format(e))
finally:
    # Close the connection even when the request or JSON parsing fails.
    db.close()