基于JSoup的HTML解析(以58同城为例)
House.java
package com.newer.spider;
/**
 * Mutable holder for the four text fields extracted for one listing.
 *
 * <p>Every property is a free-form string and stays {@code null} until its
 * setter is called. The meaning of each field is whatever the scraper puts
 * into it; the names suggest layout ({@code room}), description ({@code des}),
 * price ({@code money}) and agent ({@code jjr}) -- TODO confirm with the author.
 */
public class House {

    /** Text stored through {@link #setRoom(String)}. */
    String room;

    /** Text stored through {@link #setDes(String)}. */
    String des;

    /** Text stored through {@link #setMoney(String)}. */
    String money;

    /** Text stored through {@link #setJjr(String)}. */
    String jjr;

    /** Creates an instance with all fields unset ({@code null}). */
    public House() {
    }

    /** @return the current {@code room} text, or {@code null} if unset */
    public String getRoom() {
        return this.room;
    }

    /** @param room new {@code room} text */
    public void setRoom(String room) {
        this.room = room;
    }

    /** @return the current {@code des} text, or {@code null} if unset */
    public String getDes() {
        return this.des;
    }

    /** @param des new {@code des} text */
    public void setDes(String des) {
        this.des = des;
    }

    /** @return the current {@code money} text, or {@code null} if unset */
    public String getMoney() {
        return this.money;
    }

    /** @param money new {@code money} text */
    public void setMoney(String money) {
        this.money = money;
    }

    /** @return the current {@code jjr} text, or {@code null} if unset */
    public String getJjr() {
        return this.jjr;
    }

    /** @param jjr new {@code jjr} text */
    public void setJjr(String jjr) {
        this.jjr = jjr;
    }

    /**
     * Renders only {@code des} and {@code jjr}, comma-separated and terminated
     * by a line feed; {@code room} and {@code money} are not included.
     *
     * @return {@code "<des>,<jjr>\n"}
     */
    @Override
    public String toString() {
        StringBuilder line = new StringBuilder();
        line.append(des).append(',').append(jjr).append('\n');
        return line.toString();
    }
}
Spider.java
package com.newer.spider;
import java.io.IOException;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads one listing page with Jsoup and appends every listing found on it
 * to a caller-supplied list as a {@link House}.
 */
public class Spider {

    /** Address of the page to download. */
    String url;

    /** Destination list; parsed listings are appended to it by {@link #run()}. */
    List<House> list;

    /**
     * @param url  address of the page to download
     * @param list list that receives the parsed {@link House} objects
     */
    public Spider(String url, List<House> list) {
        this.url = url;
        this.list = list;
    }

    /**
     * Fetches the page, selects every {@code .house-cell} under
     * {@code .house-list}, and turns each one into a {@link House}.
     *
     * <p>Prints the number of matched cells, then each parsed house, to standard
     * output. A cell that lacks one of the expected child elements yields an
     * empty string for that field instead of aborting the whole crawl. An
     * {@link IOException} while downloading is reported on standard error and
     * leaves the list untouched.
     */
    public void run() {
        try {
            // Download the page; Jsoup parses it into a DOM tree.
            Document document = Jsoup.connect(url).get();

            // CSS selector cheat sheet:
            //   a      -> <a> elements
            //   .abc   -> class="abc"
            //   #main  -> id="main"
            Elements es = document.select(".house-list .house-cell");
            System.out.println(es.size());

            for (Element e : es) {
                House house = new House();
                house.setDes(textOf(e, ".des"));
                house.setRoom(textOf(e, ".room"));
                house.setMoney(textOf(e, ".money"));
                house.setJjr(textOf(e, ".jjr"));

                System.out.println(house.toString());
                list.add(house);
            }
        } catch (IOException e) {
            // Say which page failed before dumping the trace.
            System.err.println("Failed to fetch " + url);
            e.printStackTrace();
        }
    }

    /**
     * Returns the text of the first descendant matching {@code cssQuery}, or an
     * empty string when there is no match ({@code selectFirst} returns
     * {@code null} in that case).
     */
    private static String textOf(Element parent, String cssQuery) {
        Element match = parent.selectFirst(cssQuery);
        return match == null ? "" : match.text();
    }
}
App.java
package com.newer.spider;
import java.util.ArrayList;
import java.util.List;
/**
 * Entry point for the single-page crawl: builds a {@link Spider} for one fixed
 * listing URL and runs it on the calling thread.
 */
public class App {

    /**
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        // The page this crawl is pointed at.
        final String target =
                "https://cs.58.com/chuzu/?PGTID=0d200001-0019-e299-bb21-2ff05b0bc4e2&ClickID=1";

        // Results are accumulated here by the spider.
        final List<House> houses = new ArrayList<>();

        // Create the spider with its task and let it work.
        new Spider(target, houses).run();
    }
}
控制台输出:
基于线程池的并发编程(以招聘网为例)
Spider.java
package com.newer.spider2;
/**
 * Placeholder crawl task: announces which thread handles which URL, then
 * sleeps for five seconds to stand in for real work.
 */
public class Spider implements Runnable {

    /** Address this task is responsible for. */
    String url;

    /**
     * @param url address this task is responsible for
     */
    public Spider(String url) {
        this.url = url;
    }

    /**
     * Prints the current thread name and the URL, then sleeps 5000 ms.
     *
     * <p>If the thread is interrupted while sleeping, the interrupt status is
     * restored before returning so that the owning executor (or any caller
     * further up the stack) can still observe the cancellation request.
     */
    @Override
    public void run() {
        System.out.printf("%s处理:%s\n", Thread.currentThread().getName(), url);
        try {
            Thread.sleep(5000);
        } catch (InterruptedException e) {
            // Thread.sleep clears the flag when it throws; set it again.
            Thread.currentThread().interrupt();
            System.err.printf("%s interrupted while handling %s%n",
                    Thread.currentThread().getName(), url);
        }
    }
}
App.java
package com.newer.spider2;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * Entry point for the thread-pool demo: submits 100 {@link Spider} tasks, one
 * per result page, to a fixed pool of 16 worker threads.
 */
public class App {

    /**
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        // Fixed-size pool: 16 threads are reused for all tasks.
        // Alternatives:
        //   Executors.newCachedThreadPool()     - unbounded, threads are reused
        //   Executors.newSingleThreadExecutor() - a single worker thread
        ExecutorService pool = Executors.newFixedThreadPool(16);

        try {
            // 16 threads work through 100 tasks.
            for (int i = 0; i < 100; i++) {
                String url = String.format("https://www.lagou.com/zhaopin/Java/%d/", i + 1);
                pool.execute(new Spider(url));
            }
        } finally {
            // Stop accepting new tasks; already-submitted tasks still run.
            // Without this the non-daemon workers keep the JVM alive forever.
            pool.shutdown();
        }
    }
}
控制台输出:
以上就是利用爬虫快速获取企业招聘信息的全部内容,有问题的小伙伴,欢迎留言!