httpClient get方式抓取数据

2021-07-19 12:05

阅读：881

标签：int protocol .text client 爬取网页 exce 状态 parse 取数

/*
   * Fetch the content of a web page.
   */
   /**
    * Fetches {@code url} with an HTTP GET and returns the response body as one string.
    *
    * <p>The body is decoded as GBK and read line by line. Line terminators are dropped
    * and the lines are concatenated without any separator, so the result is a single
    * line of text.
    *
    * <p>When the response carries no entity, a reminder is sent through {@code Remind}
    * using the log obtained from {@code ErrorLog.getVerietyLog()} and the failure
    * message (literally: "hot variety show scraping failed, please check").
    * Request and I/O failures are reported with {@code printStackTrace()}.
    *
    * @param url address of the page to fetch
    * @return the page content with line breaks removed, or {@code null} when the
    *         response has no entity or the request fails
    */
   private static String pickData(String url) {
       // try-with-resources closes the response first and the client last,
       // on every exit path (normal return, early return, exception).
       try (CloseableHttpClient httpclient = HttpClients.createDefault();
               CloseableHttpResponse response = httpclient.execute(new HttpGet(url))) {
           HttpEntity entity = response.getEntity();
           if (entity == null) {
               String content = "热门综艺节目抓取失败,请检查";
               ErrorLog el = new ErrorLog();
               Remind remind = new Remind();
               remind.remind(el.getVerietyLog(), content);
               return null;
           }
           try (BufferedReader br =
                   new BufferedReader(new InputStreamReader(entity.getContent(), "gbk"))) {
               // StringBuilder avoids re-copying the whole page on every line.
               StringBuilder page = new StringBuilder();
               String line;
               while ((line = br.readLine()) != null) {
                   page.append(line);
               }
               return page.toString();
           }
       } catch (IOException | ParseException e) {
           // ClientProtocolException is an IOException, so it is handled here as well.
           e.printStackTrace();
       }
       return null;
   }

   /*
   * Parse the page content with jsoup.
   */
   private static Variety analyzeHTMLByString(String html) {
       Variety v = new Variety();
       String[] arr = new String[3];
       Document document = Jsoup.parse(html);
       // document.select("meta").attr("charset", "utf-8");
       // System.out.println(document);
       Elements array = document.getElementsByClass("keyword");
       System.out.println(array.size());
       String content = "热门综艺节目抓取失败,请检查";
       ErrorLog el = new ErrorLog();
       if (array.size() == 0) {
           Remind remind = new Remind();
           remind.remind(el.getVerietyLog(), content);
           return null;
       }else{
           if (array.size() >= 3) {
               for (int i = 0; i                    String name = array.get(i).child(0).text();
                   arr[i] = name;
               }
           } else {
               for (int i = 0; i                    String name = array.get(i).child(0).text();
                   arr[i] = name;
               }
           }
           v.setHot1(arr[0]);
           v.setHot2(arr[1]);
           v.setHot3(arr[2]);
           return v;
       }

   }

httpClient get方式抓取数据

标签：int protocol .text client 爬取网页 exce 状态 parse 取数

原文地址：http://www.cnblogs.com/lixiuming521125/p/7058577.html

上一篇：HTML DOM 属性

下一篇：【HTTP】另类的POST头数据 RFC1867协议格式简析

文章来自：搜素材网的编程语言模块，转载请注明文章出处。
文章标题：httpClient get方式抓取数据
文章链接：http://soscw.com/essay/106306.html

亲，登录后才可以留言！

httpClient get方式抓取数据

评论

热门文章

推荐文章

最新文章

置顶文章