Skip to content

Commit cfb99e0

Browse files
committed
1.0.17 - Added generic *chan ripper #8
1 parent 3a2dcdb commit cfb99e0

4 files changed

Lines changed: 180 additions & 2 deletions

File tree

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
<groupId>com.rarchives.ripme</groupId>
55
<artifactId>ripme</artifactId>
66
<packaging>jar</packaging>
7-
<version>1.0.16</version>
7+
<version>1.0.17</version>
88
<name>ripme</name>
99
<url>http://rip.rarchives.com</url>
1010
<properties>
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
package com.rarchives.ripme.ripper.rippers;
2+
3+
import java.io.IOException;
4+
import java.net.MalformedURLException;
5+
import java.net.URL;
6+
import java.util.HashSet;
7+
import java.util.Set;
8+
import java.util.regex.Matcher;
9+
import java.util.regex.Pattern;
10+
11+
import org.apache.log4j.Logger;
12+
import org.jsoup.Jsoup;
13+
import org.jsoup.nodes.Document;
14+
import org.jsoup.nodes.Element;
15+
16+
import com.rarchives.ripme.ripper.AbstractRipper;
17+
18+
public class ChanRipper extends AbstractRipper {
19+
20+
private static final Logger logger = Logger.getLogger(ChanRipper.class);
21+
22+
public ChanRipper(URL url) throws IOException {
23+
super(url);
24+
}
25+
26+
@Override
27+
public String getHost() {
28+
String host = this.url.getHost();
29+
host = host.substring(0, host.lastIndexOf('.'));
30+
if (host.contains(".")) {
31+
// Host has subdomain (www)
32+
host = host.substring(host.lastIndexOf('.') + 1);
33+
}
34+
String board = this.url.toExternalForm().split("/")[3];
35+
return host + "_" + board;
36+
}
37+
38+
@Override
39+
public boolean canRip(URL url) {
40+
// TODO Whitelist?
41+
return url.getHost().contains("chan") && url.toExternalForm().contains("/res/");
42+
}
43+
44+
/**
45+
* Reformat given URL into the desired format (all images on single page)
46+
*/
47+
public URL sanitizeURL(URL url) throws MalformedURLException {
48+
return url;
49+
}
50+
51+
@Override
52+
public String getGID(URL url) throws MalformedURLException {
53+
Pattern p; Matcher m;
54+
55+
p = Pattern.compile("^.*chan.*\\.[a-z]{2,3}/[a-z]+/res/([0-9]+)(\\.html|\\.php)?.*$");
56+
m = p.matcher(url.toExternalForm());
57+
if (m.matches()) {
58+
return m.group(1);
59+
}
60+
61+
throw new MalformedURLException(
62+
"Expected *chan URL formats: "
63+
+ "*chan.com/@/res/####.html"
64+
+ " Got: " + url);
65+
}
66+
67+
@Override
68+
public void rip() throws IOException {
69+
Set<String> attempted = new HashSet<String>();
70+
int index = 0;
71+
Pattern p; Matcher m;
72+
logger.info(" Retrieving " + this.url.toExternalForm());
73+
Document doc = Jsoup.connect(this.url.toExternalForm())
74+
.userAgent(USER_AGENT)
75+
.get();
76+
for (Element link : doc.select("a")) {
77+
if (!link.hasAttr("href")) {
78+
continue;
79+
}
80+
if (!link.attr("href").contains("/src/")) {
81+
logger.debug("Skipping link that does not contain /src/: " + link.attr("href"));
82+
continue;
83+
}
84+
System.err.println("URL=" + link.attr("href"));
85+
p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif)$", Pattern.CASE_INSENSITIVE);
86+
m = p.matcher(link.attr("href"));
87+
if (m.matches()) {
88+
String image = link.attr("href");
89+
if (image.startsWith("//")) {
90+
image = "http:" + image;
91+
}
92+
if (image.startsWith("/")) {
93+
image = "http://" + this.url.getHost() + image;
94+
}
95+
if (attempted.contains(image)) {
96+
logger.debug("Already attempted: " + image);
97+
continue;
98+
}
99+
index += 1;
100+
addURLToDownload(new URL(image), String.format("%03d_", index));
101+
attempted.add(image);
102+
}
103+
}
104+
waitForThreads();
105+
}
106+
107+
}

src/main/java/com/rarchives/ripme/ui/UpdateUtils.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
public class UpdateUtils {
2020

2121
private static final Logger logger = Logger.getLogger(UpdateUtils.class);
22-
private static final String DEFAULT_VERSION = "1.0.14";
22+
private static final String DEFAULT_VERSION = "1.0.17";
2323
private static final String updateJsonURL = "http://rarchives.com/ripme.json";
2424
private static final String updateJarURL = "http://rarchives.com/ripme.jar";
2525
private static final String mainFileName = "ripme.jar";
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
package com.rarchives.ripme.tst.ripper.rippers;
2+
3+
import java.io.IOException;
4+
import java.net.URL;
5+
import java.util.ArrayList;
6+
import java.util.List;
7+
8+
import com.rarchives.ripme.ripper.rippers.ChanRipper;
9+
10+
public class ChanRipperTest extends RippersTest {
11+
12+
public void testChanURLFailures() throws IOException {
13+
List<URL> failURLs = new ArrayList<URL>();
14+
// URLs that should not work
15+
for (URL url : failURLs) {
16+
try {
17+
new ChanRipper(url);
18+
fail("Instantiated ripper for URL that should not work: " + url);
19+
} catch (Exception e) {
20+
// Expected
21+
continue;
22+
}
23+
}
24+
}
25+
26+
public void testChanURLPasses() throws IOException {
27+
List<URL> passURLs = new ArrayList<URL>();
28+
// URLs that should work
29+
passURLs.add(new URL("http://desuchan.net/v/res/7034.html"));
30+
passURLs.add(new URL("http://boards.4chan.org/r/res/12225949"));
31+
passURLs.add(new URL("http://boards.420chan.org/ana/res/75984.php"));
32+
passURLs.add(new URL("http://7chan.org/gif/res/23795.html"));
33+
passURLs.add(new URL("http://unichan2.org/b/res/518004.html"));
34+
passURLs.add(new URL("http://xchan.pw/porn/res/437.html"));
35+
for (URL url : passURLs) {
36+
try {
37+
ChanRipper ripper = new ChanRipper(url);
38+
assert(ripper.canRip(url));
39+
deleteDir(ripper.getWorkingDir());
40+
} catch (Exception e) {
41+
fail("Failed to instantiate ripper for " + url);
42+
}
43+
}
44+
}
45+
46+
public void testChanRipper() throws IOException {
47+
if (!DOWNLOAD_CONTENT) {
48+
return;
49+
}
50+
List<URL> contentURLs = new ArrayList<URL>();
51+
// URLs that should return more than 1 image
52+
contentURLs.add(new URL("http://desuchan.net/v/res/7034.html"));
53+
contentURLs.add(new URL("http://boards.4chan.org/r/res/12225949"));
54+
contentURLs.add(new URL("http://boards.420chan.org/ana/res/75984.php"));
55+
contentURLs.add(new URL("http://7chan.org/gif/res/23795.html"));
56+
contentURLs.add(new URL("http://unichan2.org/b/res/518004.html"));
57+
contentURLs.add(new URL("http://xchan.pw/porn/res/437.html"));
58+
for (URL url : contentURLs) {
59+
try {
60+
ChanRipper ripper = new ChanRipper(url);
61+
ripper.rip();
62+
assert(ripper.getWorkingDir().listFiles().length > 1);
63+
deleteDir(ripper.getWorkingDir());
64+
} catch (Exception e) {
65+
e.printStackTrace();
66+
fail("Error while ripping URL " + url + ": " + e.getMessage());
67+
}
68+
}
69+
}
70+
71+
}

0 commit comments

Comments
 (0)