Java开发
Jsoup的使用教程(2)-设置头信息
2025-01-22 30 0
简介 Jsoup的使用说明二-设置头信息
在网络爬虫中,经常需要设置一些头信息。设置头信息的作用是伪装网络爬虫,使得网络爬虫请求网页更像浏览器访问网页,
进而降低网络爬虫被网站封锁的风险。Jsoup 中提供了两种设置头信息的方法,如下所示。
Connection header(String name, String value);
Connection headers(Map<String,String> headers);
1.单个设置
第一种方法每次只可以设置一个请求头,如果要设置多个请求头,需要多次调用此方法。
/**
 * Demonstrates setting a single request header on a Jsoup connection via
 * {@code Connection.header(String, String)}.
 */
public class JsoupConnectHeader {

    /** Browser-like User-Agent value sent with the request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36";

    /**
     * Requests the page with a User-Agent header set and prints the resulting document.
     *
     * @param args unused
     * @throws IOException propagated from the GET request
     */
    public static void main(String[] args) throws IOException {
        Connection connect = Jsoup.connect("http://www.baidu.com");
        // header(...) sets one header per call; repeat the call for each additional header.
        Connection conheader = connect.header("User-Agent", USER_AGENT);
        Document document = conheader.get();
        System.out.println(document);
    }
}
2.批量设置
第二种方法先将多个请求头添加至 Map 集合,再通过一次调用批量设置。
/**
 * Demonstrates setting several request headers at once on a Jsoup connection via
 * {@code Connection.headers(Map)}.
 */
public class JsoupConnectHeaderMap {

    /**
     * Collects the request headers in a map, applies them in one call, then requests the
     * page and prints the resulting document.
     *
     * @param args unused
     * @throws IOException propagated from the GET request
     */
    public static void main(String[] args) throws IOException {
        Connection connect = Jsoup.connect("http://www.baidu.com");

        // Gather every header first so they can be applied with a single headers(...) call.
        Map<String, String> header = new HashMap<>();
        // NOTE(review): the Host value is a masked placeholder that differs from the URL's
        // host — presumably it should name the site actually being requested; confirm.
        header.put("Host", "www.********.com.cn");
        header.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36");
        header.put("Accept", "text/html,application/xhtml+xml, application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        header.put("Accept-Language", "zh-cn,zh;q=0.5");
        header.put("Accept-Encoding", "gzip, deflate");
        header.put("Cache-Control", "max-age=0");
        header.put("Connection", "keep-alive");

        Connection conheader = connect.headers(header);
        Document document = conheader.get();
        System.out.println(document);
    }
}
3.添加User-Agent库和Referer库
/**
 * Demonstrates rotating the User-Agent and Referer headers by picking a random entry
 * from a predefined pool for each request.
 */
public class JsoupConnectHeaderList {

    /**
     * Builds the header map from a {@link Builder}, choosing a random User-Agent and
     * Referer, then requests the page and prints the resulting document.
     *
     * @param args unused
     * @throws IOException propagated from the GET request
     */
    public static void main(String[] args) throws IOException {
        Connection connect = Jsoup.connect("http://www.baidu.com");

        // Instantiate the holder of header values.
        Builder builder = new Builder();
        // A different Host may be supplied per request; setting it is optional.
        builder.host = "www.********.com.cn";

        // One generator is shared by both picks instead of allocating a new one each time.
        Random random = new Random();

        // Copy the values from the Builder into the header map.
        Map<String, String> header = new HashMap<String, String>();
        header.put("Host", builder.host);
        header.put("User-Agent", pickRandom(builder.userAgentList, random));
        header.put("Accept", builder.accept);
        header.put("Referer", pickRandom(builder.refererList, random));
        header.put("Accept-Language", builder.acceptLanguage);
        header.put("Accept-Encoding", builder.acceptEncoding);

        // Apply all headers.
        Connection conheader = connect.headers(header);
        Document document = conheader.get(); // send the GET request
        System.out.println(document); // print the HTML
    }

    /** Returns one element of {@code pool}, chosen uniformly with {@code random}. */
    private static String pickRandom(List<String> pool, Random random) {
        // Bound by the list's own size so the index can never drift out of range.
        return pool.get(random.nextInt(pool.size()));
    }

    /**
     * Static nested class that bundles the request-header values.
     */
    static class Builder {
        // User-Agent pool; add more entries as needed.
        String[] userAgentStrs = {"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
                "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
                "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
                "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
                "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
                "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
                "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"};
        List<String> userAgentList = Arrays.asList(userAgentStrs);
        int userAgentSize = userAgentList.size();

        // Referer pool; add more entries as needed.
        String[] refererStrs = {"https://www.*****.com/",
                "https://www.*****.com/",
                "http://www.****.com",
                "https://www.**.com/"};
        List<String> refererList = Arrays.asList(refererStrs);
        int refererSize = refererList.size();

        // Accept, Accept-Language and Accept-Encoding values.
        String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
        String acceptLanguage = "zh-cn,zh;q=0.5";
        String acceptEncoding = "gzip, deflate";

        // Left unset here; main assigns it before building the header map.
        String host;
    }
}