-
Notifications
You must be signed in to change notification settings - Fork 681
/
DefaultRedisQueueEG.java
47 lines (43 loc) · 1.76 KB
/
DefaultRedisQueueEG.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
package cn.wanghaomiao.seimi.crawlers;
import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.def.DefaultRedisQueue;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.struct.Response;
import org.seimicrawler.xpath.JXDocument;
import java.util.List;
/**
* 使用seimicrawler提供的默认redis队列实现,独立启动需通过 {@link cn.wanghaomiao.seimi.config.SeimiConfig} 配置redis信息,
* spring boot 请自行注册 {@link org.redisson.Redisson} bean ,相关配置请参考 https://github.com/redisson/redisson/wiki/2.-%E9%85%8D%E7%BD%AE%E6%96%B9%E6%B3%95
* @author github.com/zhegexiaohuozi seimimaster@gmail.com
* @since 2015/10/21.
*/
//@Crawler(name = "DefRedis",queue = DefaultRedisQueue.class,useUnrepeated = false)
public class DefaultRedisQueueEG extends BaseSeimiCrawler {
@Override
public String[] startUrls() {
return new String[]{"http://www.cnblogs.com/"};
}
@Override
public void start(Response response) {
JXDocument doc = response.document();
try {
List<Object> urls = doc.sel("//a[@class='titlelnk']/@href");
logger.info("{}", urls.size());
for (Object s:urls){
push(Request.build(s.toString(),DefaultRedisQueueEG::getTitle));
}
} catch (Exception e) {
e.printStackTrace();
}
}
public void getTitle(Response response){
JXDocument doc = response.document();
try {
logger.info("url:{} {}", response.getUrl(), doc.sel("//h1[@class='postTitle']/a/text()|//a[@id='cb_post_title_url']/text()"));
//do something
} catch (Exception e) {
e.printStackTrace();
}
}
}