大学IT网 - 最懂大学生的IT学习网站! QQ资料交流群:367606806
当前位置:大学IT网 > Java技巧 > HttpClient拉取连载小说

HttpClient拉取连载小说

关键词:连载小说HttpClient  阅读(765) 赞(19)

[摘要]上午刚入手的小说,下午心血来潮想从网站上拉取下来做成电子书,呵呵,瞎折腾~说做就做~

上午刚入手的小说,下午心血来潮想从网站上拉取下来做成电子书,呵呵,瞎折腾~说做就做~

【抓包】

这一步比什么都重要,如果找不到获取真正资源的那个请求,就什么都不用做了~

先是打算用迅雷把所有页面都下载下来然后本地处理,结果发现保存下来的页面都只有界面没有内容~看了看JavaScript的代码,原来是页面ready的时候再用ajax发送post请求到另一个网址取内容。

于是再抓包核实一下。抓包工具真难搞,试了两三个都没成功,最后还是用firefox搞定了~打开页面共发送了50个请求,不过post只有两个,很快就看到http包的内容了。

【写程序拉取】

网址,请求的header,表单 都具备了,还等什么,赶紧码字爬取啦~本来还担心要伪装浏览器,要填入cookies内容,调试起来发现是想太多了,直奔网址带上表单就够了~

HttpClient的用法是现炒现卖,官方example里的QuickStart.java就已经很清晰了。再有就是debug进去看请求和响应。

用法见HttpPost和crawlOnePage(HttpRequestBase)。MyFileWriter就不贴出来献丑了,反正就是个I/O。

遇到并解决的问题:

1.响应回来的资源是gzip压缩过的,要用对应的类去解码;

2.网址序列号中个别缺页,通过判断响应里的状态码跳过即可。

(全文完,以下是代码。)

 package mycrawl;
 
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 
 import org.apache.http.HttpEntity;
 import org.apache.http.NameValuePair;
 import org.apache.http.client.ClientProtocolException;
 import org.apache.http.client.entity.GzipDecompressingEntity;
 import org.apache.http.client.entity.UrlEncodedFormEntity;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpPost;
 import org.apache.http.client.methods.HttpRequestBase;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
 import org.apache.http.message.BasicNameValuePair;
 import org.apache.http.util.EntityUtils;
 
 import crawl.common.MyFileWriter;
 
 public class MyCrawl {
 
     private static CloseableHttpClient httpclient = HttpClients.createDefault();
 
     /**
      * (1)建立post对象,包括网址和表单 (2)循环抓取每页并处理输出
      * 
      * @param args
      * @throws IOException
      */
     public static void main(String[] args) throws IOException {
         final int startChapter = 页面序列号;
         final int endChapter = 页面序列号;
final Integer bookId = bookid; String outPattern = "c:\\book*.txt"; MyFileWriter fw = new MyFileWriter(outPattern); // 创建post操作 HttpPost httpPost = new HttpPost("网址"); List<NameValuePair> nvps = new ArrayList<NameValuePair>(2); try { // post的表单内容 nvps.add(new BasicNameValuePair("b", bookId.toString())); nvps.add(new BasicNameValuePair("c", "placeholder")); for (Integer i = startChapter, j = 0; i <= endChapter; i++, j++) { // 循环抓取连续章节 nvps.set(1, new BasicNameValuePair("c", i.toString())); httpPost.setEntity(new UrlEncodedFormEntity(nvps)); String outStr = MyCrawl.crawlOnePage(httpPost); if (outStr == null || outStr.isEmpty()) { j--; continue; } // 处理章节标题,懒得去抓取标题页了。 outStr = "====== " + MyCrawl.chapterArr[j] + "\r\n" + MyCrawl.prettyTxt(outStr); // System.out.println(outStr); fw.rollingAppend(outStr); } fw.getFileWriter().flush(); fw.getFileWriter().close(); System.out.println("已完成"); } finally { httpclient.close(); } } /** * 抓取单页 * * @param req * @return result * @throws ClientProtocolException * @throws IOException */ public static String crawlOnePage(HttpRequestBase req) throws ClientProtocolException, IOException { String result; CloseableHttpResponse resp = httpclient.execute(req); // 处理返回码 int status = resp.getStatusLine().getStatusCode(); if (status < 200 || status >= 300) { System.out.println("[Error] " + resp.getStatusLine().toString()); return ""; } else if (status != 200) { System.out.println("[Warn] " + resp.getStatusLine().toString()); return ""; } HttpEntity entity = resp.getEntity(); if (entity instanceof GzipDecompressingEntity) { // 解压缩内容 GzipDecompressingEntity gEntity = (GzipDecompressingEntity) entity; result = EntityUtils.toString(gEntity); } else { result = EntityUtils.toString(entity); } EntityUtils.consume(entity); resp.close(); return result; } /** * 处理换行等特殊字符 * * @param txt * @return string */ public static String prettyTxt(String txt) { if (txt == null || txt.isEmpty()) { return ""; } int contentStart = txt.indexOf("content") + 10; int 
contentEnd = txt.indexOf("  <br/><br/>  \",\"next"); txt = txt.substring(contentStart, contentEnd); return txt.replace("<br/><br/>", "\r\n"); } // 章节标题 private static final String[] chapterArr = new String[] { "第一章", "第二章" }; }


相关评论