优化 `following-sibling`、`following`、`preceding-sibling`、`preceding` 的行为,以便更好地适配文本提取场景,如下:
/**
 * Checks what the {@code following-sibling}, {@code preceding-sibling},
 * {@code following} and {@code preceding} axes select from a flat HTML
 * fragment that mixes elements with bare text ("11" and "12").
 * The method name presumably refers to issue-tracker entries 64 and 65.
 */
@Test
public void issue64And65() {
    // Bare text "11" sits right after <div>5</div>, and "12" right after <tag>6</tag>.
    String content = "<div class='a'>1</div>" +
            "<div>2</div>\n" +
            "<div class='a'>3</div>\n" +
            "<div>4</div>\n" +
            "<div>5</div>11" +
            "<tag>6</tag>12" +
            "<div>7<span>8</span></div>";
    JXDocument j = JXDocument.create(content);

    // following-sibling, starting from <div>5</div>.
    Assert.assertEquals("7", j.selNOne("//div[text()='5']/following-sibling::div/text()").asString());
    Assert.assertEquals("6", j.selNOne("//div[text()='5']/following-sibling::tag/text()").asString());
    Assert.assertEquals("11", j.selNOne("//div[text()='5']/following-sibling::text()").asString());

    // preceding-sibling, starting from <div>7...</div>; the expected values are the
    // closest preceding text / div / tag, not the first one in the document.
    Assert.assertEquals("12", j.selNOne("//div[text()='7']/preceding-sibling::text()").asString());
    Assert.assertEquals("5", j.selNOne("//div[text()='7']/preceding-sibling::div/text()").asString());
    Assert.assertEquals("6", j.selNOne("//div[text()='7']/preceding-sibling::tag/text()").asString());

    // following: expected results include text nested inside later elements ("8" in the span).
    Assert.assertEquals("11 6 12 7 8", joinedText(j, "//div[text()='5']/following::text()"));
    Assert.assertEquals("6", joinedText(j, "//div[text()='5']/following::tag/text()"));
    Assert.assertEquals("8", joinedText(j, "//div[text()='5']/following::span/text()"));
    Assert.assertEquals("5 7", joinedText(j, "//div[text()='4']/following::div/text()"));

    // preceding: expected results are listed nearest-first.
    Assert.assertEquals("2 1", joinedText(j, "//div[text()='3']/preceding::text()"));
    Assert.assertEquals("3 2 1", joinedText(j, "//div[text()='4']/preceding::text()"));
}

/**
 * Evaluates {@code xpath} with {@code selN}, converts every result with
 * {@code Objects::toString}, joins the pieces with single spaces and trims
 * the joined string.
 *
 * @param doc   document to query
 * @param xpath XPath expression to evaluate
 * @return the space-joined, trimmed string form of all results
 */
private String joinedText(JXDocument doc, String xpath) {
    return doc.selN(xpath).stream()
            .map(Objects::toString)
            .collect(Collectors.joining(" "))
            .trim();
}
以及豆瓣详情页提取测试:
/**
 * Loads the {@code d_detail_page.html} fixture and pulls four values out of it
 * by XPath: score, title, page count and price. All four are logged; the page
 * count and price are also asserted against fixed expected values.
 */
@Test
public void testDoubanDetailInfoExtra() throws Exception {
    final String scorePath = "//*[@id=\"interest_sectl\"]/div/div[2]/strong/text()";
    final String titlePath = "//*[@id=\"wrapper\"]/h1/span/text()";
    // "页数" = page count, "定价" = list price; each value is the text that follows its label span.
    final String pageCountPath = "//*[@id=\"info\"]/span[contains(text(),'页数')]/following-sibling::text()";
    final String pricePath = "//*[@id=\"info\"]/span[contains(text(),'定价')]/following-sibling::text()";

    JXDocument page = createFromResource("d_detail_page.html");

    JXNode scoreNode = page.selNOne(scorePath);
    logger.info("{}", scoreNode.asString());

    JXNode titleNode = page.selNOne(titlePath);
    logger.info("{}", titleNode.asString());

    JXNode pageCountNode = page.selNOne(pageCountPath);
    logger.info("{}", pageCountNode.asString());
    Assert.assertEquals("956", pageCountNode.asString());

    JXNode priceNode = page.selNOne(pricePath);
    logger.info("{}", priceNode.asString());
    Assert.assertEquals("139.00元", priceNode.asString());
}