Java|用“Java”来爬取小说章节

1 前提简介

在上一篇“Java|使用WebMagic进行电话爬取”的文章里，已经介绍了如何使用PageProcessor去爬取电话号码。接下来要学习的是爬取起点中文网的小说，并且按照小说名和章节分别保存。

2 简单查看

下面就是需要去爬取的小说页面以及内容，但保存下来的文件只需要章节内容，像第一章的开头就不需要，于是需要注意去判断。

图2.1 起点中文网

图2.2 玄幻新书

图2.3 反派强无敌

图2.4 章节内容

3 代码及注释

话不多说，需要的解释都以注释的形式写在代码里，下面就来看看详细的代码。值得注意的是，提取内容用的xpath不要写错，否则可能会导致爬取失败：

package com.yellow.java_pachong.book;

import us.codecraft.webmagic.Page;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.Spider;

import us.codecraft.webmagic.processor.PageProcessor;

import us.codecraft.webmagic.selector.Html;

import us.codecraft.webmagic.selector.Selectable;

import java.io.File;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.PrintWriter;

import java.util.ArrayList;

import java.util.List;

/**

* 爬取起点小说

public class GetQidianBook implements PageProcessor {

//设置带爬取文件的相关配置

private Site site = Site.me()

.setCharset("utf-8")//设置字符集

.setTimeOut(1000)//设置超时时间

.setSleepTime(1000);//设置休眠时间

//书的名字

String bookName1 = "";

@Override

public Site getSite() {return site;}

//爬取数据逻辑

//第一级URL https://www.qidian.com/xuanhuan 获取书栏目录

//第二级 https://book.qidian.com/info/1019251979#Catalog 章节目录

//第三级 https://read.qidian.com/chapter/SaT8jsiJD54smgY_yC2imA2/oQbX6YtwB_NOBDFlr9quQA2 章节内容

@Override

public void process(Page page) {

//获取URL

Selectable table = page.getUrl();

//System.out.println(table);

//URL匹配用.{23}去代替字符匹配，每个章节的后缀不一样

if (table.regex("https://read.qidian.com/chapter/.{23}/.{23}").match()) {//文章章节页面

//获取html页面信息

Html html = page.getHtml();

//打印html

//System.out.println(html);

//章节标题

String title = "";

//内容集合

List<String> content = new ArrayList<String>();

//抓取有用信息

//判断是否是第一章

if (html.xpath("/html/body/div[2]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/h1/text()").toString() != null) {//是第一章

//获取书名

bookName1 = html.xpath("/html/body/div[2]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/h1/text()").toString();

//System.out.println(bookName);

//获取章节名

title = html.xpath("[@class='main-text-wrap']/div[1]/h3/span/text()").toString();

//System.out.println(title);

//获取文章内容

content = html.xpath("[@class='main-text-wrap']/div[2]/p/text()").all();

} else {//不是第一章

//获取章节名

title = html.xpath("[@id='j_chapterBox']/div[1]/div[1]/div[1]/h3/span/text()").toString();

//获取文章内容

content = html.xpath("[@id='j_chapterBox']/div[1]/div[1]/div[2]/p/text()").all();

}

//存到本地

downBook(bookName1, title, content);

}else if(table.regex("https://book.qidian.com/info/\\d{10}#Catalog").match()){//书的章节目录

//获取每一章节的地址,在章节目录里每一章的xpath

List<String> url = page.getHtml().xpath("[@class='volume-wrap']/div[1]/ul/li/a/@href").all();

//加入待爬取序列

page.addTargetRequests(url);

}else{//一级url

//获取Html页面

Html html = page.getHtml();

//解析出每本书的url

List<String> url = html.xpath("[@id='new-book-list']/div/ul/li/div[2]/h4/a/@href").all();

//拼接成完整的路径

List<String> url2 = new ArrayList<String>();

for (String string : url) {

url2.add(string + "#Catalog");

}

//加入待爬取序列

page.addTargetRequests(url2);

}

//将书存入本地

private void downBook(String bookName2, String title, List<String> content) {

//判断目录存不存在

File file = new File("D:/book.xuanhuan/" + bookName2);

if(!file.exists()){

file.mkdirs();

}

PrintWriter pw = null; //使用IO流

try {

//存为txt文件及其路径

FileOutputStream fos = new FileOutputStream("D:/book.xuanhuan/" + bookName2 + "/" + title + ".txt");

pw = new PrintWriter(fos,true);

for (String string : content) {

pw.println(string);

}

//爬完一章打印

System.out.println(title + " " + "爬取完毕");

} catch (FileNotFoundException e) {

e.printStackTrace();

} finally {//关流

pw.close();

}

//创建线程

public static void main(String[] args) {//爬取了玄幻类的书

Spider.create(new GetQidianBook()).thread(1).addUrl("https://www.qidian.com/xuanhuan").run();

}

4 结果展示

首先是控制台的打印：

图4.1 控制台打印

然后是保存文件路径：

图4.2 文件路径

最后是章节内容：

图4.3 章节内容

这样就自动规整地爬取到了书籍。

END

主编 | 张祯悦

责编 | 黄晓锋

where2go 团队

微信号：算法与编程之美

本站仅提供存储服务，所有内容均由用户发布，如发现有害或侵权内容，请点击举报。