最近这几天学习Android 的抓取网页新闻
拜读了这位大牛的文章:http://blog.csdn.net/lmj623565791/article/details/23532797
参考这位大牛的源码试着自己书写了一下,结果惨不忍睹;不过由于是学习阶段,我也就忍了。
虽然失败了,但是东西还是学习到了(失败原因是 CSDN 对网页做了处理,我只能抓取到无效页面)。
首先 读取网页源码
1 public String doGet(String arg0,final int newsType) throws Exception{ 2 //可变比特缓存 3 final ByteArrayOutputStream out = new ByteArrayOutputStream(); 4 //地址URL 5 URL url=null; 6 //比特流 7 try { 8 url= new URL(arg0); 9 //打开网络链接10 HttpURLConnection conn = (HttpURLConnection) url.openConnection();11 //使用get方式读取数据12 conn.setRequestMethod("GET");13 //设置超时5秒14 conn.setConnectTimeout(5*1000);15 //设置允许读写POST请求必须输入这两行16 conn.setDoInput(true);17 conn.setDoOutput(true);18 int flag= conn.getResponseCode();19 if(flag==HttpURLConnection.HTTP_OK){//如果返回200 则说明请求成功20 //取得输入流21 InputStream in = conn.getInputStream();22 byte[] data = readStream(in);23 //新建一个线程用于存储二进制数据进入文件24 // new Thread(){25 // public void run(){26 // ByteArrayOutputStreamWrite(in,newsType);27 // }28 // }.start();29 String html = new String(data);30 return html;31 }else{32 throw new Exception("访问网络失败!");33 }34 } catch (MalformedURLException e) {35 throw new Exception("解析网络地址失败!");36 } catch (IOException e) {37 throw new Exception("读取数据失败!");38 }39 }40 41 42 43 private byte[] readStream(InputStream in) throws IOException {44 ByteArrayOutputStream out =new ByteArrayOutputStream(); 45 //把流存入缓存中46 int len = -1;47 byte[] bs = new byte[1024];48 while((len=in.read(bs))!=-1){49 out.write(bs, 0, len);50 }51 in.close();52 out.close();53 return out.toByteArray();54 }
接着 解析源代码从中抽取有用信息
1 /** 2 * 网页解析 3 * @param data 网页源代码 4 * @return 数据数组 5 * @throws IOException 6 */ 7 public List<Data> dataFilter(String data,int newsType) throws IOException { 8 List<Data> list = new ArrayList<Data>(); 9 Document doc= Jsoup.parse(data);10 Elements units = doc.getElementsByClass("unit");11 for(int i=0;i<units.size();i++){12 Data thisdata=new Data();13 Element unit = units.get(i);14 //取得h1标签的内容15 Element h1_ele=unit.getElementsByTag("h1").get(0);16 //取得h1中<a>标签的内容17 Element h1_a_ele = h1_ele.child(0);18 //转换成文本19 String title = h1_a_ele.text();20 //取得<a>的harf属性作为链接21 String url = h1_a_ele.attr("href");22 23 Element h4_ele=unit.getElementsByTag("h4").get(0);24 Element h4_time_ele = h4_ele.child(0);25 String date=h4_time_ele.text();26 Element dl_ele = unit.getElementsByTag("dl").get(0);27 Element dt_ele = dl_ele.child(0);28 //获取图片地址和图片链接29 try{30 String imgUrl = dt_ele.child(1).attr("src");31 String imgLinkUrl = dt_ele.child(0).attr("href");32 thisdata.setImgLinkUrl(imgLinkUrl);33 thisdata.setImgUrl(imgUrl);34 }catch(Exception e){35 36 }37 Element dd_ele = dl_ele.child(1);38 String content = dd_ele.text();39 //放入data对象40 thisdata.setContent(content);41 thisdata.setTitle(title);42 thisdata.setUrl(url);43 thisdata.setDate(date);44 thisdata.setType(newsType);45 46 list.add(thisdata);47 }48 return list;49 }
联系客服