最近这几天学习Android 的抓取网页新闻
拜读了这位大牛的文章:http://blog.csdn.net/lmj623565791/article/details/23532797
参考这位大牛的源码试着自己书写了一下,结果惨不忍睹;不过由于是学习阶段,我也就忍了。
虽然失败了,但是东西还是学习到了(失败原因是 CSDN 对网页做了处理,我只能抓取到无效页面)。
首先 读取网页源码
1 public String doGet(String arg0,final int newsType) throws Exception{ 2 //可变比特缓存 3 final ByteArrayOutputStream out = new ByteArrayOutputStream(); 4 //地址URL 5 URL url=null; 6 //比特流 7 try { 8 url= new URL(arg0); 9 //打开网络链接10 HttpURLConnection conn = (HttpURLConnection) url.openConnection();11 //使用get方式读取数据12 conn.setRequestMethod("GET");13 //设置超时5秒14 conn.setConnectTimeout(5*1000);15 //设置允许读写POST请求必须输入这两行16 conn.setDoInput(true);17 conn.setDoOutput(true);18 int flag= conn.getResponseCode();19 if(flag==HttpURLConnection.HTTP_OK){//如果返回200 则说明请求成功20 //取得输入流21 InputStream in = conn.getInputStream();22 byte[] data = readStream(in);23 //新建一个线程用于存储二进制数据进入文件24 // new Thread(){25 // public void run(){26 // ByteArrayOutputStreamWrite(in,newsType);27 // }28 // }.start();29 String html = new String(data);30 return html;31 }else{32 throw new Exception("访问网络失败!");33 }34 } catch (MalformedURLException e) {35 throw new Exception("解析网络地址失败!");36 } catch (IOException e) {37 throw new Exception("读取数据失败!");38 }39 }40 41 42 43 private byte[] readStream(InputStream in) throws IOException {44 ByteArrayOutputStream out =new ByteArrayOutputStream(); 45 //把流存入缓存中46 int len = -1;47 byte[] bs = new byte[1024];48 while((len=in.read(bs))!=-1){49 out.write(bs, 0, len);50 }51 in.close();52 out.close();53 return out.toByteArray();54 }
接着 解析源代码从中抽取有用信息
1 /** 2 * 网页解析 3 * @param data 网页源代码 4 * @return 数据数组 5 * @throws IOException 6 */ 7 public List<Data> dataFilter(String data,int newsType) throws IOException { 8 List<Data> list = new ArrayList<Data>(); 9 Document doc= Jsoup.parse(data);10 Elements units = doc.getElementsByClass("unit");11 for(int i=0;i<units.size();i++){12 Data thisdata=new Data();13 Element unit = units.get(i);14 //取得h1标签的内容15 Element h1_ele=unit.getElementsByTag("h1").get(0);16 //取得h1中<a>标签的内容17 Element h1_a_ele = h1_ele.child(0);18 //转换成文本19 String title = h1_a_ele.text();20 //取得<a>的harf属性作为链接21 String url = h1_a_ele.attr("href");22 23 Element h4_ele=unit.getElementsByTag("h4").get(0);24 Element h4_time_ele = h4_ele.child(0);25 String date=h4_time_ele.text();26 Element dl_ele = unit.getElementsByTag("dl").get(0);27 Element dt_ele = dl_ele.child(0);28 //获取图片地址和图片链接29 try{30 String imgUrl = dt_ele.child(1).attr("src");31 String imgLinkUrl = dt_ele.child(0).attr("href");32 thisdata.setImgLinkUrl(imgLinkUrl);33 thisdata.setImgUrl(imgUrl);34 }catch(Exception e){35 36 }37 Element dd_ele = dl_ele.child(1);38 String content = dd_ele.text();39 //放入data对象40 thisdata.setContent(content);41 thisdata.setTitle(title);42 thisdata.setUrl(url);43 thisdata.setDate(date);44 thisdata.setType(newsType);45 46 list.add(thisdata);47 }48 return list;49 }
联系客服