抓取网页链接
2020-11-26 23:25
标签:package import public 网页
package com.smilezl.scrapy; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; import java.sql.Connection; import java.sql.DriverManager; import java.sql.Statement; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class ScrapyUrl {
/** * 解析网页链接 * @param htmlUrl * @throws IOException */ public static List List
try { URL url = new URL(htmlUrl); HttpURLConnection connection = (HttpURLConnection) url.openConnection(); connection.setDoOutput(true);
String contenttype = connection.getContentType(); String charSet = getCharset(contenttype); if (charSet == null) charSet = "UTF-8"; InputStreamReader isr = new InputStreamReader(connection.getInputStream(), charSet); BufferedReader br = new BufferedReader(isr);
String str = null, rs = null; while ((str = br.readLine()) != null) { rs = getHref(str, htmlUrl); if (rs != null && !list.contains(rs)) list.add(rs); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); }
return list; }
/** Matches the {@code charset} parameter of a Content-Type value; group 1 is the bare name. */
private static final Pattern CHARSET_PATTERN =
        Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

/**
 * Extracts the charset name from an HTTP {@code Content-Type} header value.
 *
 * <p>Only the charset token itself is returned: optional surrounding quotes
 * and any parameters that follow it (for example {@code "; boundary=x"}) are
 * not included. The parameter name is matched case-insensitively.
 *
 * @param str the Content-Type value, e.g. {@code "text/html; charset=GBK"};
 *            may be {@code null}
 * @return the charset name, or {@code null} when {@code str} is {@code null}
 *         or carries no non-empty charset parameter
 */
public static String getCharset(String str) {
    if (str == null) {
        return null;
    }
    Matcher matcher = CHARSET_PATTERN.matcher(str);
    return matcher.find() ? matcher.group(1) : null;
}
/** Matches an absolute http/https URL anywhere in a line. */
private static final Pattern ABSOLUTE_LINK =
        Pattern.compile("(http://|https://){1}[\\w\\.\\-/:]+");

/**
 * Matches a root-relative {@code href} whose path ends in "html"; group 1 is
 * the path. The path may not cross a double quote, so a match stays inside a
 * single attribute value.
 */
private static final Pattern RELATIVE_LINK = Pattern.compile("href=\"(/[^\"]*html)");

/**
 * Reads the first link from one line of HTML.
 *
 * <p>An absolute {@code http://} or {@code https://} URL anywhere in the line
 * takes precedence. Otherwise a root-relative {@code href="/...html"} is
 * resolved against {@code htmlUrl}.
 *
 * @param str     one line of page source; may be {@code null}
 * @param htmlUrl URL of the page the line came from, used as the base for
 *                relative links
 * @return the link as an absolute URL, or {@code null} when the line contains
 *         no recognised link
 */
public static String getHref(String str, String htmlUrl) {
    if (str == null) {
        return null;
    }

    Matcher absolute = ABSOLUTE_LINK.matcher(str);
    if (absolute.find()) {
        return absolute.group(0);
    }

    Matcher relative = RELATIVE_LINK.matcher(str);
    if (!relative.find()) {
        return null;
    }

    String path = relative.group(1);
    try {
        // Resolve against the page URL so "/news/1.html" becomes
        // "http://host/news/1.html" rather than being appended to the page's path.
        return new URL(new URL(htmlUrl), path).toString();
    } catch (IOException e) {
        // Base is not a parseable URL: fall back to plain concatenation
        // (base + path without its leading slash).
        return (htmlUrl == null) ? null : htmlUrl + path.substring(1);
    }
}
/** * 保存链接 * @param url */ public static void saveUrlList(String hrefurl) { try { Class.forName("org.postgresql.Driver").newInstance(); String url = "jdbc:postgresql://localhost:5432/mydb?useUnicode=true&characterEncoding=gbk"; Connection con = DriverManager.getConnection(url, "postgres", "password"); Statement st = con.createStatement(); List for (int i = 0; i String sql = "insert into scrapyurl(url,type) values(‘" + list.get(i) + "‘,0)"; System.out.println(list.get(i)); st.execute(sql); } st.close(); con.close(); } catch (Exception e) { e.printStackTrace(); } }
public static void main(String[] args) { saveUrlList("http://fo.ifeng.com/fojiaomeiwen/list_0/0.shtml"); } }
抓取网页链接,搜素材,soscw.com
抓取网页链接
标签:package import public 网页
原文地址:http://smilezhuolin.blog.51cto.com/7671611/1405966