diff --git a/pom.xml b/pom.xml index 4990fdad3..5755dda2f 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ com.rarchives.ripme ripme jar - 1.0.16 + 1.0.17 ripme http://rip.rarchives.com
diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java
new file mode 100644
index 000000000..71bf73fec
--- /dev/null
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java
@@ -0,0 +1,143 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import com.rarchives.ripme.ripper.AbstractRipper;
+
+/**
+ * Ripper for imageboard ("*chan") thread pages. It scans the thread's anchors
+ * for image links whose href contains "/src/" and hands them to the downloader.
+ */
+public class ChanRipper extends AbstractRipper {
+
+    private static final Logger logger = Logger.getLogger(ChanRipper.class);
+
+    /** Thread URL shape accepted by getGID; group 1 is the numeric thread id. */
+    private static final Pattern THREAD_PATTERN = Pattern.compile(
+            "^.*chan.*\\.[a-z]{2,3}/[a-z]+/res/([0-9]+)(\\.html|\\.php)?.*$");
+
+    /** Hrefs ending in one of these extensions (any case) count as images. */
+    private static final Pattern IMAGE_PATTERN = Pattern.compile(
+            "^.*\\.(jpg|jpeg|png|gif)$", Pattern.CASE_INSENSITIVE);
+
+    public ChanRipper(URL url) throws IOException {
+        super(url);
+    }
+
+    /**
+     * Builds a "site_board" identifier from the ripper's URL, e.g. "4chan_r"
+     * for http://boards.4chan.org/r/res/12225949.
+     */
+    @Override
+    public String getHost() {
+        String host = this.url.getHost();
+        // Drop the top-level domain (".org", ".net", ...).
+        host = host.substring(0, host.lastIndexOf('.'));
+        if (host.contains(".")) {
+            // Host has subdomain (www)
+            host = host.substring(host.lastIndexOf('.') + 1);
+        }
+        // "http://host/board/..." splits on "/" into ["http:", "", host, board, ...].
+        String board = this.url.toExternalForm().split("/")[3];
+        return host + "_" + board;
+    }
+
+    /**
+     * Accepts URLs whose host contains "chan" and whose text contains "/res/".
+     */
+    @Override
+    public boolean canRip(URL url) {
+        // TODO Whitelist?
+        return url.getHost().contains("chan") && url.toExternalForm().contains("/res/");
+    }
+
+    /**
+     * Reformat given URL into the desired format (all images on single page).
+     * Currently returns the URL unchanged.
+     */
+    public URL sanitizeURL(URL url) throws MalformedURLException {
+        return url;
+    }
+
+    /**
+     * Extracts the numeric thread id from a thread URL.
+     *
+     * @throws MalformedURLException if the URL does not match the expected format
+     */
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        Matcher m = THREAD_PATTERN.matcher(url.toExternalForm());
+        if (m.matches()) {
+            return m.group(1);
+        }
+
+        throw new MalformedURLException(
+                "Expected *chan URL formats: "
+                        + "*chan.com/@/res/####.html"
+                        + " Got: " + url);
+    }
+
+    /**
+     * Downloads the thread page, passes every distinct image link to
+     * addURLToDownload with a zero-padded running index as prefix, and then
+     * calls waitForThreads().
+     */
+    @Override
+    public void rip() throws IOException {
+        Set<String> attempted = new HashSet<String>();
+        int index = 0;
+        logger.info(" Retrieving " + this.url.toExternalForm());
+        Document doc = Jsoup.connect(this.url.toExternalForm())
+                            .userAgent(USER_AGENT)
+                            .get();
+        for (Element link : doc.select("a")) {
+            if (!link.hasAttr("href")) {
+                continue;
+            }
+            String href = link.attr("href");
+            if (!href.contains("/src/")) {
+                logger.debug("Skipping link that does not contain /src/: " + href);
+                continue;
+            }
+            logger.debug("URL=" + href);
+            if (!IMAGE_PATTERN.matcher(href).matches()) {
+                continue;
+            }
+            String image = href;
+            if (image.startsWith("//")) {
+                // Protocol-relative link: prepend a scheme.
+                image = "http:" + image;
+            } else if (image.startsWith("/")) {
+                // Host-relative link: prepend scheme and the thread's host.
+                image = "http://" + this.url.getHost() + image;
+            }
+            if (!attempted.add(image)) {
+                logger.debug("Already attempted: " + image);
+                continue;
+            }
+            URL imageURL;
+            try {
+                imageURL = new URL(image);
+            } catch (MalformedURLException e) {
+                // One unusable link should not abort the rest of the thread.
+                logger.warn("Skipping malformed image URL: " + image, e);
+                continue;
+            }
+            index += 1;
+            addURLToDownload(imageURL, String.format("%03d_", index));
+        }
+        waitForThreads();
+    }
+
+}
\ No newline at end of file
diff --git a/src/main/java/com/rarchives/ripme/ui/UpdateUtils.java b/src/main/java/com/rarchives/ripme/ui/UpdateUtils.java index fe04a388f..dd75e3ead 100644 --- 
a/src/main/java/com/rarchives/ripme/ui/UpdateUtils.java +++ b/src/main/java/com/rarchives/ripme/ui/UpdateUtils.java @@ -19,7 +19,7 @@ public class UpdateUtils { private static final Logger logger = Logger.getLogger(UpdateUtils.class); - private static final String DEFAULT_VERSION = "1.0.14"; + private static final String DEFAULT_VERSION = "1.0.17"; private static final String updateJsonURL = "http://rarchives.com/ripme.json"; private static final String updateJarURL = "http://rarchives.com/ripme.jar"; private static final String mainFileName = "ripme.jar";
diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ChanRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ChanRipperTest.java
new file mode 100644
index 000000000..6f6a77c41
--- /dev/null
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ChanRipperTest.java
@@ -0,0 +1,79 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.rarchives.ripme.ripper.rippers.ChanRipper;
+
+/**
+ * Tests for ChanRipper: URL acceptance and, when DOWNLOAD_CONTENT is set,
+ * an actual rip of each sample thread.
+ */
+public class ChanRipperTest extends RippersTest {
+
+    /** Every URL in the (currently empty) fail list must be rejected by the constructor. */
+    public void testChanURLFailures() throws IOException {
+        List<URL> failURLs = new ArrayList<URL>();
+        // URLs that should not work
+        for (URL url : failURLs) {
+            try {
+                new ChanRipper(url);
+                fail("Instantiated ripper for URL that should not work: " + url);
+            } catch (Exception e) {
+                // Expected
+            }
+        }
+    }
+
+    /** Each sample thread URL must be accepted by both the constructor and canRip. */
+    public void testChanURLPasses() throws IOException {
+        // URLs that should work
+        for (URL url : sampleThreadURLs()) {
+            try {
+                ChanRipper ripper = new ChanRipper(url);
+                // Explicit check: the Java assert keyword is a no-op unless the JVM runs with -ea.
+                if (!ripper.canRip(url)) {
+                    fail("canRip rejected " + url);
+                }
+                deleteDir(ripper.getWorkingDir());
+            } catch (Exception e) {
+                fail("Failed to instantiate ripper for " + url);
+            }
+        }
+    }
+
+    /** Rips each sample thread and expects more than one file in its working directory. */
+    public void testChanRipper() throws IOException {
+        if (!DOWNLOAD_CONTENT) {
+            return;
+        }
+        // URLs that should return more than 1 image
+        for (URL url : sampleThreadURLs()) {
+            try {
+                ChanRipper ripper = new ChanRipper(url);
+                ripper.rip();
+                if (ripper.getWorkingDir().listFiles().length <= 1) {
+                    fail("Expected more than 1 file after ripping " + url);
+                }
+                deleteDir(ripper.getWorkingDir());
+            } catch (Exception e) {
+                e.printStackTrace();
+                fail("Error while ripping URL " + url + ": " + e.getMessage());
+            }
+        }
+    }
+
+    /** Sample thread URLs shared by the pass and content tests. */
+    private static List<URL> sampleThreadURLs() throws IOException {
+        List<URL> urls = new ArrayList<URL>();
+        urls.add(new URL("http://desuchan.net/v/res/7034.html"));
+        urls.add(new URL("http://boards.4chan.org/r/res/12225949"));
+        urls.add(new URL("http://boards.420chan.org/ana/res/75984.php"));
+        urls.add(new URL("http://7chan.org/gif/res/23795.html"));
+        urls.add(new URL("http://unichan2.org/b/res/518004.html"));
+        urls.add(new URL("http://xchan.pw/porn/res/437.html"));
+        return urls;
+    }
+
+}