forked from termux/termux-app
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Compile Url match regex once and not on every use
Needed for termux#2146.
- Loading branch information
1 parent
630809a
commit 5a0774b
Showing
4 changed files
with
107 additions
and
91 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
103 changes: 103 additions & 0 deletions
103
termux-shared/src/main/java/com/termux/shared/data/UrlUtils.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
package com.termux.shared.data; | ||
|
||
import java.util.LinkedHashSet; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
/**
 * Static helpers for locating URLs inside free-form text.
 *
 * <p>The URL pattern is compiled the first time {@link #getUrlMatchRegex()} is called and
 * then reused for every later call.
 */
public class UrlUtils {

    /**
     * Cached compiled URL pattern. It is {@code null} until {@link #getUrlMatchRegex()} has
     * been called once, so prefer the accessor over reading this field directly.
     */
    public static Pattern URL_MATCH_REGEX;

    /**
     * Scheme alternatives accepted in front of {@code ://}, in the order they are tried.
     * Each entry is a regex fragment, not a plain string.
     */
    private static final String[] URL_SCHEMES = {
        "dav",                  // The DAV proto.
        "dict",                 // The DICT proto.
        "dns",                  // The DNS proto.
        "file",                 // File path.
        "finger",               // The Finger proto.
        "ftp(?:s?)",            // The FTP proto.
        "git",                  // The Git proto.
        "gopher",               // The Gopher proto.
        "http(?:s?)",           // The HTTP proto.
        "imap(?:s?)",           // The IMAP proto.
        "irc(?:[6s]?)",         // The IRC proto.
        "ip[fn]s",              // The IPFS proto.
        "ldap(?:s?)",           // The LDAP proto.
        "pop3(?:s?)",           // The POP3 proto.
        "redis(?:s?)",          // The Redis proto.
        "rsync",                // The Rsync proto.
        "rtsp(?:[su]?)",        // The RTSP proto.
        "sftp",                 // The SFTP proto.
        "smb(?:s?)",            // The SAMBA proto.
        "smtp(?:s?)",           // The SMTP proto.
        "svn(?:(?:\\+ssh)?)",   // The Subversion proto.
        "tcp",                  // The TCP proto.
        "telnet",               // The Telnet proto.
        "tftp",                 // The TFTP proto.
        "udp",                  // The UDP proto.
        "vnc",                  // The VNC proto.
        "ws(?:s?)",             // The Websocket proto.
    };

    /**
     * Returns the compiled URL pattern, compiling and caching it in
     * {@link #URL_MATCH_REGEX} on first use.
     *
     * <p>The pattern has two capturing groups: group 1 is the scheme including the trailing
     * {@code ://}, group 2 is everything after it (optional credentials, host, optional port,
     * optional path/query and optional fragment). Matching is case-insensitive.
     *
     * <p>No synchronization is performed here; if several threads make the first call at the
     * same time, each may compile its own equivalent pattern before one of them is cached.
     *
     * @return the cached pattern, never {@code null}
     */
    public static Pattern getUrlMatchRegex() {
        Pattern cached = URL_MATCH_REGEX;
        if (cached == null) {
            cached = Pattern.compile(
                buildUrlMatchRegexSource(),
                Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
            URL_MATCH_REGEX = cached;
        }
        return cached;
    }

    /** Assembles the textual regular expression compiled by {@link #getUrlMatchRegex()}. */
    private static String buildUrlMatchRegexSource() {
        StringBuilder regex = new StringBuilder();

        // First matching group: one of the known schemes followed by '://'.
        regex.append("(");
        regex.append("(?:");
        for (int i = 0; i < URL_SCHEMES.length; i++) {
            if (i > 0) regex.append("|");
            regex.append(URL_SCHEMES[i]);
        }
        regex.append(")://");
        regex.append(")");

        // Second matching group: everything after the scheme.
        regex.append("(");

        // User name and/or password in format 'user:pass@'.
        regex.append("(?:\\S+(?::\\S*)?@)?");

        // Begin host group.
        regex.append("(?:");

        // IP address (from http://www.regular-expressions.info/examples.html).
        regex.append("(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|");

        // Host name or domain.
        regex.append("(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)(?:(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*(?:\\.(?:[a-z\\u00a1-\\uffff]{2,})))?|");

        // Just path. Used in case of 'file://' scheme.
        regex.append("/(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)");

        // End host group.
        regex.append(")");

        // Port number.
        regex.append("(?::\\d{1,5})?");

        // Resource path with optional query string.
        regex.append("(?:/[a-zA-Z0-9:@%\\-._~!$&()*+,;=?/]*)?");

        // Fragment.
        regex.append("(?:#[a-zA-Z0-9:@%\\-._~!$&()*+,;=?/]*)?");

        // End second matching group.
        regex.append(")");

        return regex.toString();
    }

    /**
     * Collects every URL found in {@code text}.
     *
     * @param text the text to scan; may be {@code null} or empty
     * @return the distinct URLs in order of first appearance; empty (never {@code null}) when
     *         {@code text} is {@code null}, empty, or contains no URL
     */
    public static LinkedHashSet<CharSequence> extractUrls(String text) {
        LinkedHashSet<CharSequence> urlSet = new LinkedHashSet<>();

        // Pattern.matcher(null) would throw a NullPointerException; there is nothing to scan.
        if (text == null || text.isEmpty()) return urlSet;

        Matcher matcher = getUrlMatchRegex().matcher(text);
        while (matcher.find()) {
            // Group 1 opens at the very start of the pattern, so the whole match
            // is exactly the span from the scheme to the end of the URL.
            urlSet.add(matcher.group());
        }

        return urlSet;
    }

}