Skip to content

Commit

Permalink
Compile Url match regex once and not on every use
Browse files Browse the repository at this point in the history
Needed for termux#2146.
  • Loading branch information
agnostic-apollo committed Jun 29, 2021
1 parent 630809a commit 5a0774b
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 91 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import com.termux.R;
import com.termux.app.TermuxActivity;
import com.termux.shared.data.UrlUtils;
import com.termux.shared.shell.ShellUtils;
import com.termux.shared.terminal.TermuxTerminalViewClientBase;
import com.termux.shared.termux.TermuxConstants;
Expand Down Expand Up @@ -603,7 +604,7 @@ public void showUrlSelection() {

String text = ShellUtils.getTerminalSessionTranscriptText(session, true, true);

LinkedHashSet<CharSequence> urlSet = DataUtils.extractUrls(text);
LinkedHashSet<CharSequence> urlSet = UrlUtils.extractUrls(text);
if (urlSet.isEmpty()) {
new AlertDialog.Builder(mActivity).setMessage(R.string.title_select_url_none_found).show();
return;
Expand Down
4 changes: 2 additions & 2 deletions app/src/test/java/com/termux/app/TermuxActivityTest.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package com.termux.app;

import com.termux.shared.data.DataUtils;
import com.termux.shared.data.UrlUtils;

import org.junit.Assert;
import org.junit.Test;
Expand All @@ -13,7 +13,7 @@ public class TermuxActivityTest {
private void assertUrlsAre(String text, String... urls) {
LinkedHashSet<String> expected = new LinkedHashSet<>();
Collections.addAll(expected, urls);
Assert.assertEquals(expected, DataUtils.extractUrls(text));
Assert.assertEquals(expected, UrlUtils.extractUrls(text));
}

@Test
Expand Down
88 changes: 0 additions & 88 deletions termux-shared/src/main/java/com/termux/shared/data/DataUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -165,92 +165,4 @@ public static boolean isNullOrEmpty(String string) {
return string == null || string.isEmpty();
}


public static LinkedHashSet<CharSequence> extractUrls(String text) {

StringBuilder regex_sb = new StringBuilder();

regex_sb.append("("); // Begin first matching group.
regex_sb.append("(?:"); // Begin scheme group.
regex_sb.append("dav|"); // The DAV proto.
regex_sb.append("dict|"); // The DICT proto.
regex_sb.append("dns|"); // The DNS proto.
regex_sb.append("file|"); // File path.
regex_sb.append("finger|"); // The Finger proto.
regex_sb.append("ftp(?:s?)|"); // The FTP proto.
regex_sb.append("git|"); // The Git proto.
regex_sb.append("gopher|"); // The Gopher proto.
regex_sb.append("http(?:s?)|"); // The HTTP proto.
regex_sb.append("imap(?:s?)|"); // The IMAP proto.
regex_sb.append("irc(?:[6s]?)|"); // The IRC proto.
regex_sb.append("ip[fn]s|"); // The IPFS proto.
regex_sb.append("ldap(?:s?)|"); // The LDAP proto.
regex_sb.append("pop3(?:s?)|"); // The POP3 proto.
regex_sb.append("redis(?:s?)|"); // The Redis proto.
regex_sb.append("rsync|"); // The Rsync proto.
regex_sb.append("rtsp(?:[su]?)|"); // The RTSP proto.
regex_sb.append("sftp|"); // The SFTP proto.
regex_sb.append("smb(?:s?)|"); // The SAMBA proto.
regex_sb.append("smtp(?:s?)|"); // The SMTP proto.
regex_sb.append("svn(?:(?:\\+ssh)?)|"); // The Subversion proto.
regex_sb.append("tcp|"); // The TCP proto.
regex_sb.append("telnet|"); // The Telnet proto.
regex_sb.append("tftp|"); // The TFTP proto.
regex_sb.append("udp|"); // The UDP proto.
regex_sb.append("vnc|"); // The VNC proto.
regex_sb.append("ws(?:s?)"); // The Websocket proto.
regex_sb.append(")://"); // End scheme group.
regex_sb.append(")"); // End first matching group.


// Begin second matching group.
regex_sb.append("(");

// User name and/or password in format 'user:pass@'.
regex_sb.append("(?:\\S+(?::\\S*)?@)?");

// Begin host group.
regex_sb.append("(?:");

// IP address (from http://www.regular-expressions.info/examples.html).
regex_sb.append("(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|");

// Host name or domain.
regex_sb.append("(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)(?:(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*(?:\\.(?:[a-z\\u00a1-\\uffff]{2,})))?|");

// Just path. Used in case of 'file://' scheme.
regex_sb.append("/(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)");

// End host group.
regex_sb.append(")");

// Port number.
regex_sb.append("(?::\\d{1,5})?");

// Resource path with optional query string.
regex_sb.append("(?:/[a-zA-Z0-9:@%\\-._~!$&()*+,;=?/]*)?");

// Fragment.
regex_sb.append("(?:#[a-zA-Z0-9:@%\\-._~!$&()*+,;=?/]*)?");

// End second matching group.
regex_sb.append(")");

final Pattern urlPattern = Pattern.compile(
regex_sb.toString(),
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

LinkedHashSet<CharSequence> urlSet = new LinkedHashSet<>();
Matcher matcher = urlPattern.matcher(text);

while (matcher.find()) {
int matchStart = matcher.start(1);
int matchEnd = matcher.end();
String url = text.substring(matchStart, matchEnd);
urlSet.add(url);
}

return urlSet;
}

}
103 changes: 103 additions & 0 deletions termux-shared/src/main/java/com/termux/shared/data/UrlUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
package com.termux.shared.data;

import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class UrlUtils {

public static Pattern URL_MATCH_REGEX;

public static Pattern getUrlMatchRegex() {
if (URL_MATCH_REGEX != null) return URL_MATCH_REGEX;

StringBuilder regex_sb = new StringBuilder();

regex_sb.append("("); // Begin first matching group.
regex_sb.append("(?:"); // Begin scheme group.
regex_sb.append("dav|"); // The DAV proto.
regex_sb.append("dict|"); // The DICT proto.
regex_sb.append("dns|"); // The DNS proto.
regex_sb.append("file|"); // File path.
regex_sb.append("finger|"); // The Finger proto.
regex_sb.append("ftp(?:s?)|"); // The FTP proto.
regex_sb.append("git|"); // The Git proto.
regex_sb.append("gopher|"); // The Gopher proto.
regex_sb.append("http(?:s?)|"); // The HTTP proto.
regex_sb.append("imap(?:s?)|"); // The IMAP proto.
regex_sb.append("irc(?:[6s]?)|"); // The IRC proto.
regex_sb.append("ip[fn]s|"); // The IPFS proto.
regex_sb.append("ldap(?:s?)|"); // The LDAP proto.
regex_sb.append("pop3(?:s?)|"); // The POP3 proto.
regex_sb.append("redis(?:s?)|"); // The Redis proto.
regex_sb.append("rsync|"); // The Rsync proto.
regex_sb.append("rtsp(?:[su]?)|"); // The RTSP proto.
regex_sb.append("sftp|"); // The SFTP proto.
regex_sb.append("smb(?:s?)|"); // The SAMBA proto.
regex_sb.append("smtp(?:s?)|"); // The SMTP proto.
regex_sb.append("svn(?:(?:\\+ssh)?)|"); // The Subversion proto.
regex_sb.append("tcp|"); // The TCP proto.
regex_sb.append("telnet|"); // The Telnet proto.
regex_sb.append("tftp|"); // The TFTP proto.
regex_sb.append("udp|"); // The UDP proto.
regex_sb.append("vnc|"); // The VNC proto.
regex_sb.append("ws(?:s?)"); // The Websocket proto.
regex_sb.append(")://"); // End scheme group.
regex_sb.append(")"); // End first matching group.


// Begin second matching group.
regex_sb.append("(");

// User name and/or password in format 'user:pass@'.
regex_sb.append("(?:\\S+(?::\\S*)?@)?");

// Begin host group.
regex_sb.append("(?:");

// IP address (from http://www.regular-expressions.info/examples.html).
regex_sb.append("(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|");

// Host name or domain.
regex_sb.append("(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)(?:(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*(?:\\.(?:[a-z\\u00a1-\\uffff]{2,})))?|");

// Just path. Used in case of 'file://' scheme.
regex_sb.append("/(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)");

// End host group.
regex_sb.append(")");

// Port number.
regex_sb.append("(?::\\d{1,5})?");

// Resource path with optional query string.
regex_sb.append("(?:/[a-zA-Z0-9:@%\\-._~!$&()*+,;=?/]*)?");

// Fragment.
regex_sb.append("(?:#[a-zA-Z0-9:@%\\-._~!$&()*+,;=?/]*)?");

// End second matching group.
regex_sb.append(")");

URL_MATCH_REGEX = Pattern.compile(
regex_sb.toString(),
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

return URL_MATCH_REGEX;
}

public static LinkedHashSet<CharSequence> extractUrls(String text) {
LinkedHashSet<CharSequence> urlSet = new LinkedHashSet<>();
Matcher matcher = getUrlMatchRegex().matcher(text);

while (matcher.find()) {
int matchStart = matcher.start(1);
int matchEnd = matcher.end();
String url = text.substring(matchStart, matchEnd);
urlSet.add(url);
}

return urlSet;
}

}

0 comments on commit 5a0774b

Please sign in to comment.