Skip to content

Commit

Permalink
Added twitter support.
Browse files Browse the repository at this point in the history
  • Loading branch information
4pr0n committed Mar 5, 2014
1 parent 0fc42d8 commit 4a47cc6
Show file tree
Hide file tree
Showing 3 changed files with 312 additions and 1 deletion.
282 changes: 282 additions & 0 deletions src/main/java/com/rarchives/ripme/ripper/rippers/TwitterRipper.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,282 @@
package com.rarchives.ripme.ripper.rippers;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.JSONTokener;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.rarchives.ripme.ripper.AbstractRipper;
import com.rarchives.ripme.utils.Utils;

public class TwitterRipper extends AbstractRipper {

private static final String DOMAIN = "twitter.com",
HOST = "twitter";
private static final Logger logger = Logger.getLogger(TwitterRipper.class);

private static final int MAX_REQUESTS = 2;
private static final int WAIT_TIME = 2000;

// Base 64 of consumer key : consumer secret
private String authKey;
private String accessToken;

private enum ALBUM_TYPE {
ACCOUNT,
SEARCH
}
private ALBUM_TYPE albumType;
private String searchText, accountName;

/**
 * Creates a ripper for the given URL and loads the Twitter API credentials
 * from the "twitter.auth" configuration entry.
 *
 * @param url the URL to rip
 * @throws IOException if no authentication key is configured
 */
public TwitterRipper(URL url) throws IOException {
    super(url);
    this.authKey = Utils.getConfigString("twitter.auth", null);
    if (this.authKey != null) {
        return;
    }
    throw new IOException("Could not find twitter authentication key in configuration");
}

/**
 * Reports whether this ripper handles the given URL.
 *
 * The host must be exactly the Twitter domain or a subdomain of it. A bare
 * suffix check would also accept unrelated hosts such as "nottwitter.com".
 *
 * @param url the URL to test
 * @return true if the URL's host is twitter.com or one of its subdomains
 */
@Override
public boolean canRip(URL url) {
    String host = url.getHost();
    return host.equals(DOMAIN) || host.endsWith("." + DOMAIN);
}

// Compiled once instead of on every sanitizeURL() call.
// Search: group 2 is the raw (still URL-encoded) value of the "q" parameter,
// i.e. everything up to the next '&' or '#'.
// e.g. https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd
private static final Pattern SEARCH_URL_PATTERN =
        Pattern.compile("^https?://(m\\.)?twitter\\.com/search\\?q=([^&#]+).*$");
// Account: group 2 is the screen name. Twitter screen names may contain
// underscores, so '_' must be part of the character class.
private static final Pattern ACCOUNT_URL_PATTERN =
        Pattern.compile("^https?://(m\\.)?twitter\\.com/([a-zA-Z0-9_]{1,}).*$");

/**
 * Classifies the URL as a search or an account page and records the search
 * text or account name for later API calls.
 *
 * The search pattern is tried first because the account pattern would
 * otherwise match the "/search" path as an account named "search".
 *
 * @param url the URL supplied by the user
 * @return the same URL, unchanged
 * @throws MalformedURLException if the URL is neither a search nor an account URL
 */
@Override
public URL sanitizeURL(URL url) throws MalformedURLException {
    String external = url.toExternalForm();

    Matcher m = SEARCH_URL_PATTERN.matcher(external);
    if (m.matches()) {
        albumType = ALBUM_TYPE.SEARCH;
        searchText = m.group(2);
        return url;
    }

    m = ACCOUNT_URL_PATTERN.matcher(external);
    if (m.matches()) {
        albumType = ALBUM_TYPE.ACCOUNT;
        accountName = m.group(2);
        return url;
    }

    throw new MalformedURLException("Expected username or search string in url: " + url);
}

private void getAccessToken() throws IOException {
Document doc = Jsoup.connect("https://api.twitter.com/oauth2/token")
.ignoreContentType(true)
.header("Authorization", "Basic " + authKey)
.header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
.header("User-agent", "ripe and zipe")
.data("grant_type", "client_credentials")
.post();
String body = doc.body().html().replaceAll(""", "\"");
try {
JSONObject json = new JSONObject(body);
accessToken = json.getString("access_token");
return;
} catch (JSONException e) {
// Fall through
throw new IOException("Failure while parsing JSON: " + body, e);
}
}

private void checkRateLimits(String resource, String api) throws IOException {
Document doc = Jsoup.connect("https://api.twitter.com/1.1/application/rate_limit_status.json?resources=" + resource)
.ignoreContentType(true)
.header("Authorization", "Bearer " + accessToken)
.header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
.header("User-agent", "ripe and zipe")
.get();
String body = doc.body().html().replaceAll(""", "\"");
try {
JSONObject json = new JSONObject(body);
JSONObject stats = json.getJSONObject("resources")
.getJSONObject(resource)
.getJSONObject(api);
int remaining = stats.getInt("remaining");
logger.info(" Twitter " + resource + " calls remaining: " + remaining);
if (remaining < 20) {
logger.error("Twitter API calls exhausted: " + stats.toString());
throw new IOException("Less than 20 API calls remaining; not enough to rip.");
}
} catch (JSONException e) {
logger.error("JSONException: ", e);
throw new IOException("Error while parsing JSON: " + body, e);
}
}

/**
 * Builds the API request URL for the current album type.
 *
 * @param maxID if non-null, appended as the "max_id" parameter to page
 *              backwards through results
 * @return the timeline URL for ACCOUNT albums or the search URL for SEARCH albums
 */
private String getApiURL(String maxID) {
    StringBuilder request = new StringBuilder();
    switch (albumType) {
    case ACCOUNT:
        request.append("https://api.twitter.com/1.1/statuses/user_timeline.json")
                .append("?screen_name=").append(accountName)
                .append("&include_entities=true")
                .append("&exclude_replies=true")
                .append("&trim_user=true")
                .append("&include_rts=false")
                .append("&count=").append(200);
        break;
    case SEARCH:
        request.append("https://api.twitter.com/1.1/search/tweets.json")
                .append("?q=").append(searchText)
                .append("&include_entities=true")
                .append("&result_type=recent")
                .append("&count=100");
        break;
    }
    if (maxID == null) {
        return request.toString();
    }
    return request.append("&max_id=").append(maxID).toString();
}

/**
 * Fetches one page of tweets from the given API URL.
 *
 * The response is either a JSON object with a "statuses" array or a bare
 * JSON array; both forms are accepted.
 *
 * @param url the fully-built API request URL
 * @return the tweets on this page, possibly empty
 * @throws IOException if the request fails, Twitter reports errors, or the
 *         response is not the expected JSON
 */
private List<JSONObject> getTweets(String url) throws IOException {
    logger.info(" Retrieving " + url);
    Document doc = Jsoup.connect(url)
            .ignoreContentType(true)
            .header("Authorization", "Bearer " + accessToken)
            .header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
            .header("User-agent", "ripe and zipe")
            .get();
    // The JSON response went through Jsoup's HTML parser, so html() gives the
    // quotes back as &quot; entities; undo that before parsing as JSON.
    String body = doc.body().html().replaceAll("&quot;", "\"");

    List<JSONObject> tweets = new ArrayList<JSONObject>();
    try {
        Object jsonObj = new JSONTokener(body).nextValue();
        JSONArray statuses;
        if (jsonObj instanceof JSONObject) {
            JSONObject json = (JSONObject) jsonObj;
            if (json.has("errors")) {
                throw new IOException("Twitter responded with errors: "
                        + describeErrors(json.get("errors")));
            }
            statuses = json.getJSONArray("statuses");
        } else if (jsonObj instanceof JSONArray) {
            statuses = (JSONArray) jsonObj;
        } else {
            throw new IOException("Unexpected response from Twitter: " + body);
        }
        for (int i = 0; i < statuses.length(); i++) {
            tweets.add(statuses.getJSONObject(i));
        }
    } catch (JSONException e) {
        // Surface malformed responses as the IOException callers already handle.
        throw new IOException("Error while parsing JSON: " + body, e);
    }
    return tweets;
}

/**
 * Renders the value of an "errors" field as readable text. Twitter sends an
 * array of objects each carrying a "message"; a single object or a plain
 * value is tolerated as well.
 */
private static String describeErrors(Object errors) {
    if (errors instanceof JSONArray) {
        JSONArray list = (JSONArray) errors;
        StringBuilder text = new StringBuilder();
        for (int i = 0; i < list.length(); i++) {
            if (i > 0) {
                text.append("; ");
            }
            Object entry = list.get(i);
            if (entry instanceof JSONObject) {
                text.append(((JSONObject) entry).optString("message", entry.toString()));
            } else {
                text.append(entry);
            }
        }
        return text.toString();
    }
    if (errors instanceof JSONObject) {
        return ((JSONObject) errors).optString("message", errors.toString());
    }
    return String.valueOf(errors);
}

/**
 * Extracts downloadable content from one tweet.
 *
 * Every entry under entities.media is queued for download; twimg.com URLs
 * get a ":large" suffix appended. Every entry under entities.urls is passed
 * to {@code handleTweetedURL}.
 *
 * @param tweet a tweet object as returned by the API
 * @throws MalformedURLException if a media URL cannot be parsed
 */
private void parseTweet(JSONObject tweet) throws MalformedURLException {
    if (!tweet.has("entities")) {
        logger.error("XXX Tweet doesn't have entitites");
        return;
    }

    JSONObject entities = tweet.getJSONObject("entities");

    if (entities.has("media")) {
        JSONArray medias = entities.getJSONArray("media");
        for (int i = 0; i < medias.length(); i++) {
            JSONObject media = medias.getJSONObject(i);
            String url = media.getString("media_url");
            if (url.contains(".twimg.com/")) {
                url += ":large";
            }
            addURLToDownload(new URL(url));
        }
    }

    if (entities.has("urls")) {
        JSONArray urls = entities.getJSONArray("urls");
        for (int i = 0; i < urls.length(); i++) {
            JSONObject url = urls.getJSONObject(i);
            // Prefer the expanded target; fall back to the shortened URL only
            // when "expanded_url" is absent or JSON null. isNull() covers both,
            // whereas get() throws on a missing key and never returns null.
            String key = url.isNull("expanded_url") ? "url" : "expanded_url";
            handleTweetedURL(url.getString(key));
        }
    }
}

/**
 * Placeholder for links found in a tweet: the URL is only logged, nothing
 * is downloaded.
 *
 * @param url a URL taken from the tweet's "urls" entities
 */
private void handleTweetedURL(String url) {
    String notice = "[!] Need to handle URL: " + url;
    logger.error(notice);
}

/**
 * Rips the account or search: obtains a bearer token, checks the rate
 * limit, then requests up to MAX_REQUESTS pages of tweets and parses each
 * tweet, before waiting for download threads to finish.
 *
 * @throws IOException if authentication, the rate-limit check, or a page
 *         request fails
 */
@Override
public void rip() throws IOException {
    getAccessToken();

    switch (albumType) {
    case ACCOUNT:
        checkRateLimits("statuses", "/statuses/user_timeline");
        break;
    case SEARCH:
        checkRateLimits("search", "/search/tweets");
        break;
    }

    String maxID = null;
    for (int i = 0; i < MAX_REQUESTS; i++) {
        List<JSONObject> tweets = getTweets(getApiURL(maxID));
        if (tweets.isEmpty()) {
            logger.info(" No more tweets found.");
            break;
        }

        long oldestID = Long.MAX_VALUE;
        for (JSONObject tweet : tweets) {
            String id = tweet.getString("id_str");
            try {
                oldestID = Math.min(oldestID, Long.parseLong(id));
            } catch (NumberFormatException e) {
                throw new IOException("Unexpected tweet id: " + id, e);
            }
            parseTweet(tweet);
        }
        // max_id is inclusive: asking for the oldest id seen would return that
        // tweet again on the next page, so request strictly older tweets.
        maxID = Long.toString(oldestID - 1);

        if (i + 1 >= MAX_REQUESTS) {
            // No further request follows, so there is nothing to wait for.
            break;
        }
        try {
            Thread.sleep(WAIT_TIME);
        } catch (InterruptedException e) {
            // NOTE(review): the interrupt flag is deliberately not restored,
            // so the waitForThreads() call below behaves as before -- confirm
            // whether callers need the interrupt propagated.
            logger.error("[!] Interrupted while waiting to load more results", e);
            break;
        }
    }

    waitForThreads();
}

/**
 * @return the short host identifier for this ripper ("twitter")
 */
@Override
public String getHost() {
    return TwitterRipper.HOST;
}

/**
 * Builds the album identifier from the state recorded by {@code sanitizeURL}.
 *
 * Accounts yield "account_" plus the account name. Searches yield "search_"
 * plus the search text, where each URL-encoded sequence (a '%' and the two
 * characters after it) becomes a single '_', ASCII letters and digits are
 * kept, and every other character is dropped.
 *
 * @param url the URL being ripped; used only in the error message
 * @return the album identifier
 * @throws MalformedURLException if the album type has not been determined
 */
@Override
public String getGID(URL url) throws MalformedURLException {
    // Plain comparisons instead of a switch: switching on a null albumType
    // throws NullPointerException before the intended exception is reached.
    if (albumType == ALBUM_TYPE.ACCOUNT) {
        return "account_" + accountName;
    }
    if (albumType == ALBUM_TYPE.SEARCH) {
        StringBuilder gid = new StringBuilder();
        for (int i = 0; i < searchText.length(); i++) {
            char c = searchText.charAt(i);
            if (c == '%') {
                // Collapse the URL-encoded sequence "%XX" into one underscore.
                gid.append('_');
                i += 2;
            } else if ((c >= 'a' && c <= 'z')
                    || (c >= 'A' && c <= 'Z')
                    || (c >= '0' && c <= '9')) {
                gid.append(c);
            }
            // Any other character is ignored.
        }
        return "search_" + gid.toString();
    }
    throw new MalformedURLException("Could not decide type of URL (search/account): " + url);
}

}
3 changes: 2 additions & 1 deletion src/main/resources/rip.properties
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
threads.size = 5
file.overwrite = false
download.retries = 3
download.retries = 3
twitter.auth = VW9Ybjdjb1pkd2J0U3kwTUh2VXVnOm9GTzVQVzNqM29LQU1xVGhnS3pFZzhKbGVqbXU0c2lHQ3JrUFNNZm8=
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package com.rarchives.ripme.tst.ripper.rippers;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import com.rarchives.ripme.ripper.rippers.TwitterRipper;

public class TwitterRipperTest extends RippersTest {

public void testTwitterAlbums() throws IOException {
List<URL> contentURLs = new ArrayList<URL>();
//contentURLs.add(new URL("https://twitter.com/danngamber01/media"));
contentURLs.add(new URL("https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd"));
for (URL url : contentURLs) {
try {
TwitterRipper ripper = new TwitterRipper(url);
ripper.rip();
assert(ripper.getWorkingDir().listFiles().length > 1);
deleteDir(ripper.getWorkingDir());
} catch (Exception e) {
e.printStackTrace();
fail("Error while ripping URL " + url + ": " + e.getMessage());
}
}
}
}

0 comments on commit 4a47cc6

Please sign in to comment.