1+ package com .rarchives .ripme .ripper .rippers ;
2+
3+ import java .io .IOException ;
4+ import java .net .MalformedURLException ;
5+ import java .net .URL ;
6+ import java .util .HashSet ;
7+ import java .util .Set ;
8+ import java .util .regex .Matcher ;
9+ import java .util .regex .Pattern ;
10+
11+ import org .apache .log4j .Logger ;
12+ import org .jsoup .Jsoup ;
13+ import org .jsoup .nodes .Document ;
14+ import org .jsoup .nodes .Element ;
15+
16+ import com .rarchives .ripme .ripper .AbstractRipper ;
17+
18+ public class ChanRipper extends AbstractRipper {
19+
20+ private static final Logger logger = Logger .getLogger (ChanRipper .class );
21+
22+ public ChanRipper (URL url ) throws IOException {
23+ super (url );
24+ }
25+
26+ @ Override
27+ public String getHost () {
28+ String host = this .url .getHost ();
29+ host = host .substring (0 , host .lastIndexOf ('.' ));
30+ if (host .contains ("." )) {
31+ // Host has subdomain (www)
32+ host = host .substring (host .lastIndexOf ('.' ) + 1 );
33+ }
34+ String board = this .url .toExternalForm ().split ("/" )[3 ];
35+ return host + "_" + board ;
36+ }
37+
38+ @ Override
39+ public boolean canRip (URL url ) {
40+ // TODO Whitelist?
41+ return url .getHost ().contains ("chan" ) && url .toExternalForm ().contains ("/res/" );
42+ }
43+
44+ /**
45+ * Reformat given URL into the desired format (all images on single page)
46+ */
47+ public URL sanitizeURL (URL url ) throws MalformedURLException {
48+ return url ;
49+ }
50+
51+ @ Override
52+ public String getGID (URL url ) throws MalformedURLException {
53+ Pattern p ; Matcher m ;
54+
55+ p = Pattern .compile ("^.*chan.*\\ .[a-z]{2,3}/[a-z]+/res/([0-9]+)(\\ .html|\\ .php)?.*$" );
56+ m = p .matcher (url .toExternalForm ());
57+ if (m .matches ()) {
58+ return m .group (1 );
59+ }
60+
61+ throw new MalformedURLException (
62+ "Expected *chan URL formats: "
63+ + "*chan.com/@/res/####.html"
64+ + " Got: " + url );
65+ }
66+
67+ @ Override
68+ public void rip () throws IOException {
69+ Set <String > attempted = new HashSet <String >();
70+ int index = 0 ;
71+ Pattern p ; Matcher m ;
72+ logger .info (" Retrieving " + this .url .toExternalForm ());
73+ Document doc = Jsoup .connect (this .url .toExternalForm ())
74+ .userAgent (USER_AGENT )
75+ .get ();
76+ for (Element link : doc .select ("a" )) {
77+ if (!link .hasAttr ("href" )) {
78+ continue ;
79+ }
80+ if (!link .attr ("href" ).contains ("/src/" )) {
81+ logger .debug ("Skipping link that does not contain /src/: " + link .attr ("href" ));
82+ continue ;
83+ }
84+ System .err .println ("URL=" + link .attr ("href" ));
85+ p = Pattern .compile ("^.*\\ .(jpg|jpeg|png|gif)$" , Pattern .CASE_INSENSITIVE );
86+ m = p .matcher (link .attr ("href" ));
87+ if (m .matches ()) {
88+ String image = link .attr ("href" );
89+ if (image .startsWith ("//" )) {
90+ image = "http:" + image ;
91+ }
92+ if (image .startsWith ("/" )) {
93+ image = "http://" + this .url .getHost () + image ;
94+ }
95+ if (attempted .contains (image )) {
96+ logger .debug ("Already attempted: " + image );
97+ continue ;
98+ }
99+ index += 1 ;
100+ addURLToDownload (new URL (image ), String .format ("%03d_" , index ));
101+ attempted .add (image );
102+ }
103+ }
104+ waitForThreads ();
105+ }
106+
107+ }
0 commit comments