手撸爬虫爬取爱奇艺视频信息(2)

   日期:2020-07-14     浏览:236    评论:0    
核心提示:上文我们直接通过Xpath解析页面元素获取信息,很笨,这次我们换一种方式,通过找规律找到json数据,直接获取并解析json即可。比较简单,代码如下:爬取爱奇艺华语院线电影和美国院线电影 import json; import random; import urllib.request; import urllib.response; import pymysql; useragent = [ "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)", ……

上文我们直接通过Xpath解析页面元素获取信息,很笨,这次我们换一种方式,通过找规律找到json数据直接获取解析json即可。
比较简单,代码如下:

爬取爱奇艺华语院线电影和美国院线电影

import json
import random
import urllib.request
import urllib.response
import pymysql

# Pool of desktop browser User-Agent strings; one entry is drawn at random
# each time the script starts.
useragent = [
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
]

# HTTP headers attached to the API request.
headers1 = {
    # random.choice draws from the whole pool; the previous
    # random.randint(0, 5) index could only reach the first six entries.
    'User-Agent': random.choice(useragent),
    # NOTE(review): the next four keys look like HTTP/2 pseudo-header fields
    # copied from browser dev tools; here they are sent as ordinary headers.
    # Confirm whether the server needs them.
    "method": "GET",
    "authority": "list.iqiyi.com",
    "path": "/www/1/1-27815------------11-1-1-iqiyi--.html",
    "scheme": "https",
    # "*/*" is the wildcard media range; the earlier "**" was not a valid one.
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "max-age=0",
    "upgrade-insecure-requests": "1",
    # Hard-coded session cookie captured from a browser session.
    "cookie": "QC005=b4b984320692d3b4314ab4e223adcd2d; QC173=0; Hm_lvt_53b7374a63c37483e5dd97d78d9bb36e=1585879290; QC175=%7B%22upd%22%3Atrue%2C%22ct%22%3A%22%22%7D; QC007=DIRECT; QC006=cxi755tl8psf5cdjn3675ae7; QC008=1585886016.1585886016.1585886016.1; nu=0; T00404=a8a63ad6d7b7e4dceba700e91b9dbbea; IMS=IggQABj_up30BSokCiAyMjMwMzVkODNlODEzNzZlMTk0ODg1N2ZkNzFkZTZjYRAAciQKIDIyMzAzNWQ4M2U4MTM3NmUxOTQ4ODU3ZmQ3MWRlNmNhEAA; QP001=1; QP0013=; QC010=33145914; Hm_lpvt_53b7374a63c37483e5dd97d78d9bb36e=1585888116; __dfp=a1c2f37ae6eb3d4d62ac566e461536f7169314d9d24b23dbb85ca81e8138c8394d@1587175290494@1585879291494",
}

# Item index counters for the Chinese (Ci) and USA (Ui) result lists.
Ci, Ui = 1, 1
# Page numbers: i for the Chinese listing, j for the USA listing.
i, j = 1, 1

# Open the database connection.
# Connection parameters are passed by keyword: PyMySQL 1.0+ rejects
# positional host/user/password/database arguments.
db = pymysql.connect(
    host="******",
    user="****",
    password="*******",
    database="*******",
    port=3306,
    charset='utf8',
)

# Disabled section: crawler for the Chinese theatrical-release listing.
# It is wrapped in a bare triple-quoted string, so none of it executes.
# NOTE: the section ends with db.close(); re-enabling it unchanged would
# close the connection before the USA section below uses it.
'''
# 华语院线
print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
print("华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语")
print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

ChineseUrl = "https://pcw-api.iqiyi.com/search/video/videolists?access_play_control_platform=14&channel_id=1&data_type=1&from=pcw_list&is_album_finished=&is_purchase=&key=&market_release_date_level=&mode=11&pageNum={}&pageSize=48&site=iqiyi&source_type=&three_category_id=1;must,27815;must&without_qipu=1"  # 华语院线 pageNum控制分页  共19页
print("###############################第{}页####################################".format(i))
ChineseUrl = ChineseUrl.format(i)
chineseRequest = urllib.request.Request(url=ChineseUrl, headers=headers1)
ChineseReponse = urllib.request.urlopen(chineseRequest)
Chinesehtml = ChineseReponse.read().decode('utf-8')
Chinesejson = json.loads(Chinesehtml)
while Ci <= 47:
    # 使用cursor()方法获取操作游标
    cursor = db.cursor()
    print("------------------------------------------------------------------------------------------------")
    print(Ci)
    print(Chinesejson['data']['list'][Ci]['name'])
    print(Chinesejson['data']['list'][Ci]['playUrl'])
    print(Chinesejson['data']['list'][Ci]['imageUrl'])
    print(Chinesejson['data']['list'][Ci]['score'])
    print(Chinesejson['data']['list'][Ci]['secondInfo'])
    # SQL 插入语句
    sql = """INSERT INTO iqiyiVideo(
                        name,score,videourl,imageurl,mainact,flag    
                        )
                        VALUES ('{0}','{1}','{2}','{3}','{4}','{5}')""".format(Chinesejson['data']['list'][Ci]['name'],
                                                                   Chinesejson['data']['list'][Ci]['score'],
                                                                   Chinesejson['data']['list'][Ci]['playUrl'],
                                                                   Chinesejson['data']['list'][Ci]['imageUrl'],
                                                                   Chinesejson['data']['list'][Ci]['secondInfo'],
                                                                   "chinese"
                                                                   )
    try:
        cursor.execute(sql)  # 执行sql语句
        db.commit()  # 提交到数据库执行
        print("插入成功!")
    except Exception as e:
        db.rollback()  # 如果发生错误则回滚
        print("插入失败{}".format(e))

    Ci = Ci + 1

db.close()
'''

# USA theatrical releases: fetch one page of the listing API as JSON and
# store every movie on it in the iqiyiVideo table with flag "USA".
print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
print("美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国")
print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
# pageNum selects the page (the original author noted 12 pages in total).
USAUrl = "https://pcw-api.iqiyi.com/search/video/videolists?access_play_control_platform=14&channel_id=1&data_type=1&from=pcw_list&is_album_finished=&is_purchase=&key=&market_release_date_level=&mode=11&pageNum={}&pageSize=48&site=iqiyi&source_type=&three_category_id=2;must,27815;must&without_qipu=1"
print("###############################第{}页####################################".format(j))
USAUrl = USAUrl.format(j)
# headers1 is the only header dict this file defines; the previous reference
# to headers2 raised NameError before any request was made.
USARequest = urllib.request.Request(url=USAUrl, headers=headers1)
# Close the HTTP response as soon as the body has been read.
with urllib.request.urlopen(USARequest) as USAReponse:
    USAhtml = USAReponse.read().decode('utf-8')
USAjson = json.loads(USAhtml)

# Parameterized statement: the driver escapes the values, so quotes in a
# movie title can neither break the statement nor inject SQL.
insert_sql = (
    "INSERT INTO iqiyiVideo (name, score, videourl, imageurl, mainact, flag) "
    "VALUES (%s, %s, %s, %s, %s, %s)"
)

try:
    # Walk whatever the page actually returned, starting with the first
    # entry. The old fixed 1..47 index range skipped item 0 and raised
    # IndexError on a page with fewer than 48 items.
    for Ui, movie in enumerate(USAjson['data']['list'], start=1):
        print("------------------------------------------------------------------------------------------------")
        print(Ui)
        print(movie['name'])
        print(movie['playUrl'])
        print(movie['imageUrl'])
        print(movie['score'])
        print(movie['secondInfo'])
        row = (
            movie['name'],
            movie['score'],
            movie['playUrl'],
            movie['imageUrl'],
            movie['secondInfo'],
            "USA",
        )
        try:
            # One short-lived cursor per row, closed by the context manager.
            with db.cursor() as cursor:
                cursor.execute(insert_sql, row)
            db.commit()
        except Exception as e:
            # Best effort per row: undo the failed insert and report it
            # instead of swallowing the error silently, then carry on.
            db.rollback()
            print("插入失败{}".format(e))
finally:
    # Release the connection even if a row is missing an expected key.
    db.close()
 
打赏
 本文转载自:网络 
所有权利归属于原作者,如文章来源标示错误或侵犯了您的权利请联系微信13520258486
更多>最近资讯中心
更多>最新资讯中心
0相关评论

推荐图文
推荐资讯中心
点击排行
最新信息
新手指南
采购商服务
供应商服务
交易安全
关注我们
手机网站:
新浪微博:
微信关注:

13520258486

周一至周五 9:00-18:00
(其他时间联系在线客服)

24小时在线客服