selenium+Xpath+csv爬取京东商品信息
- selenium爬取京东的手机商品信息
- python的入门库
- 查看网页源码
- 代码
- 总结
selenium爬取京东的手机商品信息
利用pycharm爬取京东商城的手机商品的信息(价格,型号,样式,或者内存详细信息)
python的入门库
师傅领进门修行在个人!!!
import requests#请求库
from bs4 import BeautifulSoup #解析网页用
import lxml #解析网页
# Request headers that imitate a normal browser visit so the site's
# anti-crawler checks are less likely to reject the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0',
    'Referer': 'https://www.jd.com/',
}
url = 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=0184af803e7e4c309dde99e28a40547d'
# Send the GET request; the timeout keeps the script from hanging forever
# when the server never answers.
r = requests.get(url, headers=headers, timeout=10)
# print(r.status_code)  # uncomment to inspect the HTTP status code
# print(r.text)  # uncomment to inspect the raw page source
# Parse the returned HTML with the lxml parser.
soup = BeautifulSoup(r.text, 'lxml')
# Every product card on the search page is wrapped in an element with
# class "gl-i-wrap".
info_first = soup.find_all(attrs={'class': 'gl-i-wrap'})
# Print each card's text on a single line by stripping the newlines.
for title in info_first:
    print(title.text.replace('\n', ''))
查看网页源码
打开京东商城搜索手机,注意要再点击一下分类处的手机(下图),否则会有其他手机相关的产品干扰信息,第二个图为手机商品图
F12大法,定位到需要爬取位置的源码,得你所得。
下一步打开PyCharm(python软件,不会还有人不会配环境吧,不会吧不会吧)
代码
**话不多说直接上代码**
**都带注释的,不懂的再私信我,在线答疑**
import csv
import json
import random
import re
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
# Accumulator for the scraped products: one dict per product.
data_list = []
# Search-result URL that the spider opens first.
url = 'https://search.jd.com/search?keyword=%E6%89%8B%E6%9C%BA&cid3=655&cid2=653&page=1&s=1&click=0'
# Location of the chromedriver executable on the local machine.
driver_path = r'F:\Pycharm\chromedriver.exe'
# Chrome preference that disables image loading to save time.
image_prefs = {'profile.managed_default_content_settings.images': 2}
options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', image_prefs)
# Launch Chrome with the options above.
# NOTE(review): `executable_path` is the Selenium 3 style of passing the
# driver location; newer Selenium releases expect a Service object instead.
# Confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path=driver_path, options=options)
def _parse_item(li):
    """Build the record dict for one product from its ``gl-item`` element.

    Args:
        li: the WebElement of a single product card.

    Returns:
        A dict with the keys ``phone_id``, ``name``, ``phone_url``,
        ``charge``, ``number``, ``shop_name`` and ``phone_proprietary``,
        in that order (the CSV header is taken from this order).

    Raises:
        selenium.common.exceptions.NoSuchElementException: when a required
            field (name, link, price or comment count) is missing.
    """
    # Product name, with the "京品手机" badge text and newlines removed.
    name = li.find_element(
        By.XPATH, './/div[@class="p-name p-name-type-2"]//em').text
    name = re.sub(r"京品手机|\n", "", name, flags=re.S)
    # Product link. find_element raises a Selenium error when the anchor is
    # missing, instead of the IndexError that find_elements(...)[0] gave.
    phone_url = li.find_element(
        By.XPATH, ".//div[@class='p-name p-name-type-2']/a"
    ).get_attribute("href")
    # Price text.
    charge = li.find_element(By.XPATH, './/div[@class="p-price"]//i').text
    # Number of comments.
    number = li.find_element(
        By.XPATH, './/div[@class="p-commit"]/strong/a').text
    # Shop name is optional; fall back to the string "null" when absent.
    shops = li.find_elements(By.XPATH, './/div[@class="p-shop"]//a')
    shop_name = shops[0].text if shops else "null"
    # The product is self-operated when the first icon reads "自营".
    badges = li.find_elements(By.XPATH, ".//div[@class='p-icons']/i[1]")
    phone_proprietary = bool(badges) and badges[0].text == "自营"
    return {
        'phone_id': li.get_attribute("data-sku"),
        'name': name,
        'phone_url': phone_url,
        'charge': charge,
        'number': number,
        'shop_name': shop_name,
        'phone_proprietary': phone_proprietary,
    }


def start_spider():
    """Walk every search-result page and collect the product records.

    Opens the module-level ``url`` in the module-level ``driver``, reads the
    total page count from the pager, then for each page scrolls to trigger
    lazy loading, parses every ``gl-item`` element and appends the resulting
    dict to the module-level ``data_list``. Selenium errors on a page or on
    a single item are printed and skipped so the crawl continues.

    Raises:
        ValueError: if the page-count text is not an integer.
        selenium.common.exceptions.WebDriverException: if the initial page
            load or the click on the "next page" button fails.
    """
    # Local import keeps this function self-contained.
    from selenium.common.exceptions import WebDriverException

    driver.get(url)
    # Explicit wait (up to 1000 s) until the "next page" button exists.
    WebDriverWait(driver, 1000).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'pn-next'))
    )
    # Total number of pages. int() replaces eval(): text scraped from a web
    # page must never be evaluated as code.
    all_page = int(
        driver.find_element(By.CSS_SELECTOR, 'span.p-skip>em>b').text)
    print(all_page)
    # Number of pages processed so far.
    count = 0
    while True:
        count += 1
        try:
            # Explicit wait until product entries are present.
            WebDriverWait(driver, 1000).until(
                EC.presence_of_all_elements_located(
                    (By.CLASS_NAME, 'gl-item'))
            )
            # Scroll to the bottom so the lazily loaded products render,
            # give them time to load, then scroll back to the top.
            driver.execute_script('document.documentElement.scrollTop=10000')
            time.sleep(3)
            driver.execute_script('document.documentElement.scrollTop=0')
            lis = driver.find_elements(By.CLASS_NAME, 'gl-item')
        except WebDriverException as e:
            # Best effort: report the failed page and move on.
            print(e)
        else:
            print(lis)
            for li in lis:
                try:
                    data_dict = _parse_item(li)
                except WebDriverException as e:
                    # Skip only the malformed item, not the rest of the page.
                    print(e)
                    continue
                data_list.append(data_dict)
                print(data_dict)
        # Stop after the last page; >= also terminates if the pager
        # reports fewer pages than were already visited.
        if count >= all_page:
            break
        driver.find_element(By.CLASS_NAME, 'pn-next').click()
        time.sleep(2)
def main():
    """Run the spider, then dump the collected records to JSON and CSV.

    Writes ``data_json.json`` and, when at least one record was collected,
    ``data_csv.csv`` in the current working directory. Both files are
    overwritten on every run.
    """
    start_spider()
    # Write the data to a JSON file. Mode 'w' rather than 'a+': appending a
    # second JSON array to an existing file would make the file invalid JSON.
    with open('data_json.json', 'w', encoding='utf-8') as f:
        json.dump(data_list, f, ensure_ascii=False, indent=4)
    print('json文件写入完成')
    # Without any record there is no header row to derive, so skip the CSV
    # instead of failing on data_list[0].
    if not data_list:
        print('没有抓取到数据,跳过csv文件写入')
        return
    with open('data_csv.csv', 'w', encoding='utf-8', newline='') as f:
        # The header row comes from the keys of the first record.
        writer = csv.DictWriter(f, fieldnames=list(data_list[0]))
        writer.writeheader()
        # Write all records in one batch.
        writer.writerows(data_list)
    print('csv文件写入完成')
if __name__ == '__main__':
    try:
        main()
    finally:
        # Quit the browser even when the crawl fails, so no Chrome or
        # chromedriver process is left running.
        driver.quit()
总结
多学多练多操作