java 抓取网页图片
2020-11-20 22:54
标签:des blog http java color os
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/*** * java抓取网络图片
*
* @author swinglife
*
*/
public
class CatchImage
{ // 地址
private
static final String URL = "http://www.4493.com/";
// 编码
private
static final String ECODING = "UTF-8";
// 获取img标签正则
private
static final String IMGURL_REG = ";
// 获取src路径的正则
private
static final String IMGSRC_REG = "http:\"?(.*?)(\"|>|\\s+)";
private
static final String IMGDSRC_REG = "[\"\‘](http.+\\.(jpg|JPG|png|PNG|gif|GIF))[\"\‘]";
private
static final String[] picstuffix = { "jpg", "JPG", "gif", "GIF", "png", "PNG"
};
private
static Listnew
ArrayList();
public
static void main(String[] args) throws
Exception
{
CatchImage cm = new
CatchImage();
// 获得html文本内容
String HTML = cm.getHTML(URL);
// System.out.println(HTML);
// 获取图片标签
List
// 获取图片src地址
List
// 下载图片
cm.Download(imgSrc, "E:\\Imagesave"+saveDiff());
// cm.getImageSrc(HTML); // cm.ThreadDownload(imgSrc, "E:\\Imagesave"+saveDiff() , 6);
// cm.TOThreadDownload(pList, "E:\\Imagesave" + saveDiff(), 6, 6000); }
/**
 * Downloads the resource at {@code url} and returns it decoded with the
 * {@code ECODING} charset.
 *
 * The raw bytes are collected first and decoded once at the end, so only
 * the bytes actually read are used and a multi-byte character that
 * straddles two reads is not corrupted.
 *
 * @param url address of the page to fetch
 * @return the page content as text
 * @throws Exception if the address is malformed or the transfer fails
 */
private String getHTML(String url) throws Exception {
    URL uri = new URL(url);
    URLConnection connection = uri.openConnection();
    InputStream in = connection.getInputStream();
    try {
        ByteArrayOutputStream page = new ByteArrayOutputStream();
        byte[] buf = new byte[1024];
        int length;
        while ((length = in.read(buf, 0, buf.length)) != -1) {
            // Copy only the bytes this read produced, not the whole buffer.
            page.write(buf, 0, length);
        }
        return page.toString(ECODING);
    } finally {
        // Release the connection's stream even when reading fails.
        in.close();
    }
}
/***
* 获取ImageUrl地址
*
* @param HTML
* @return
*/
private
List
{
Matcher matcher = Pattern.compile(IMGURL_REG).matcher(HTML);
Listnew
ArrayList
while
(matcher.find())
{
listImgUrl.add(matcher.group());
}
return
listImgUrl;
}
/***
* 获取ImageSrc地址
*
* @param listImageUrl
* @return
*/
private
List
{
Listnew
ArrayList
for
(String image : listImageUrl)
{
Matcher matcher = Pattern.compile(IMGSRC_REG).matcher(image);
while
(matcher.find())
{
listImgSrc.add(matcher.group().substring(0, matcher.group().length() - 1));
}
}
return
listImgSrc;
}
/**
 * Scans raw markup for quoted image addresses and hands each one to
 * {@code getpicsrc}.
 *
 * Only capture group 1 of {@code IMGDSRC_REG} is forwarded, i.e. the
 * address without its surrounding quotes. Nothing is returned.
 *
 * @param html markup to scan
 */
private void getImageSrc(String html) {
    final Matcher quotedAddresses = Pattern.compile(IMGDSRC_REG).matcher(html);
    while (quotedAddresses.find()) {
        final String candidate = quotedAddresses.group(1);
        getpicsrc(candidate);
    }
}
/**
* 截取字符串里面的图片
*
* @param src
* @return
*/
public
void getpicsrc(String src)
{
if
(src.contains("http:"))
{
String[] app = src.split("http:");
for
(int i = 0; i
{
if
(!isBlank(app[i]))
{
for
(int j = 0; j
{
if
(app[i].contains("."
+ picstuffix[j]))
{
int
inum = app[i].indexOf(picstuffix[j]);
String url = "http:"
+ app[i].substring(0, inum) + picstuffix[j];
pList.add(url);
}
}
}
}
}
}
/**
* 去处重复元素
*
* @param result
* @return
*/
public
static List
{
Listnew
ArrayList
for
(int i = 0; i
{
if
(!tmpArr.contains(result.get(i)))
{
tmpArr.add((String) result.get(i));
}
}
return
tmpArr;
}
/**
* 判断非空
*
* @param cs
* @return
*/
public
static boolean isBlank(CharSequence cs)
{
int
strLen;
if
(cs == null
|| (strLen = cs.length()) == 0)
{
return
true;
}
for
(int i = 0; i
{
if
(Character.isWhitespace(cs.charAt(i)) == false)
{
return
false;
}
}
return
true;
}
/***
* 单线程下载图片
*
* @param listImgSrc
*/
private
void Download(List
{
for
(String url : listImgSrc)
{
try
{
String imageName = url.substring(url.lastIndexOf("/") + 1, url.length());
URL uri = new
URL(url);
InputStream in = uri.openStream();
FileOutputStream fo = new
FileOutputStream(new
File(savedir + imageName));
byte[] buf = new
byte[1024];
int
length = 0;
while
((length = in.read(buf, 0, buf.length)) != -1)
{
fo.write(buf, 0, length);
}
in.close();
fo.close();
System.out.println("*^_^*");
} catch
(Exception e)
{
System.out.println("-_-!");
}
}
}
/**
* 多线程下载图片
*
* @param listImgSrc
* @param savedir
* @param tnum
*/
private
void ThreadDownload(Listint
tnum)
{
for
(int i = 0; i
{
new
DThread(savedir, tnum, listImgSrc, i).start();
}
}
/**
 * Worker thread that downloads one slice of the image list.
 *
 * 2014-4-3 10:52:38
 *
 * @author ITWANG
 */
class DThread extends Thread {
    private String savedir = null;
    /** Number of entries this thread handles. */
    private int tnum;
    private List<String> listImgSrc;
    /** Index of the first entry this thread handles. */
    private int bunm;

    /**
     * @param savedir    prefix prepended to each generated file name
     * @param tnum       number of entries to download
     * @param listImgSrc full list of image addresses
     * @param bnum       index of the first entry to download
     */
    public DThread(String savedir, int tnum, List<String> listImgSrc, int bnum) {
        this.savedir = savedir;
        this.tnum = tnum;
        this.listImgSrc = listImgSrc;
        this.bunm = bnum;
    }

    /**
     * Downloads the entries of the slice, saving each under a random UUID
     * name that keeps the extension of the address.
     */
    @Override
    public void run() {
        // The last slice may be shorter than tnum; never index past the list.
        int end = Math.min(bunm + tnum, listImgSrc.size());
        for (int i = bunm; i < end; i++) {
            try {
                String url = listImgSrc.get(i);
                String sps = url.substring(url.lastIndexOf("."), url.length());
                String imageName = UUID.randomUUID().toString() + sps;
                URL uri = new URL(url);
                InputStream in = uri.openStream();
                try {
                    System.out.println(savedir + imageName);
                    FileOutputStream fo = new FileOutputStream(new File(savedir + imageName));
                    try {
                        byte[] buf = new byte[1024];
                        int length;
                        while ((length = in.read(buf, 0, buf.length)) != -1) {
                            fo.write(buf, 0, length);
                        }
                    } finally {
                        // Always release the file handle, even on a failed copy.
                        fo.close();
                    }
                } finally {
                    in.close();
                }
                System.out.println("*^_^*");
            } catch (Exception e) {
                System.out.println("-_-!");
            }
        }
    }
}
/**
* 多线程超时下载
*
* @param listImgSrc
* @param savedir
* @param tnum
* @param timeout
*/
private
void TOThreadDownload(Listint
tnum, int
timeout)
{
for
(int i = 0; i
{
new
TODThread(savedir, tnum, listImgSrc, i, timeout).start();
}
}
/**
*
* 2014-4-3上午10:52:07 Describe: 超时方式下载照片线程
*
* @author: ITWANG
*/
class
TODThread extends
Thread
{
private
String savedir = null;
private
int tnum;
private
List
private
int bunm;
private
int timeout = 3000;
public
TODThread(String savedir, int
tnum, Listint
bnum, int
timeout)
{
this.savedir = savedir;
this.tnum = tnum;
this.listImgSrc = listImgSrc;
this.bunm = bnum;
this.timeout = timeout;
}
@Override
public
void run()
{
for
(int i = 0; i
{
String url = listImgSrc.get(bunm + i);
String sps = url.substring(url.lastIndexOf("."), url.length());
String imageName = UUID.randomUUID().toString() + sps;
try
{
if
(getPic(url, savedir, imageName, timeout))
{
System.out.println("*^_^*");
} else
{
System.out.println("-_-!");
}
} catch
(Exception e)
{
System.out.println("下载异常");
}
}
}
}
/**
* GET方式下载照片
&