Skip to content

Commit

Permalink
cleaning up and testing str_locate_all_bytes
Browse files Browse the repository at this point in the history
  • Loading branch information
JBGruber committed Oct 22, 2023
1 parent c5e4f1a commit 6e1d24d
Show file tree
Hide file tree
Showing 7 changed files with 68 additions and 10 deletions.
12 changes: 12 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,15 @@
#' }
#' @source <https://github.com/bluesky-social/atproto>
"list_lexicons"


#' Regular expressions to match mentions and URLs
#'
#' Patterns used to locate mentions of user handles and URLs in text.
#'
#' @format ## `regexs`
#' A named list with two regular expressions (character strings) to match:
#' \describe{
#' \item{mention_regex}{Mentions of user handles like @atproto.com}
#' \item{url_regex}{URLs starting with `http://` or `https://`}
#' }
#' @source <https://atproto.com/blog/create-post#mentions-and-links>
"regexs"
14 changes: 6 additions & 8 deletions R/feed.r
Original file line number Diff line number Diff line change
Expand Up @@ -536,27 +536,25 @@ get_feed_likes <- function(feed_url,
parse_facets <- function(text) {

facets <- list()
mention_regex <- "(?<=[$|\\W])(@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)"
mentions <- str_locate_all_bytes(text, mention_regex)
mentions <- str_locate_all_bytes(text, regexs$mention_regex)
mentions$match <- stringr::str_remove(mentions$match, "@")
facets <- purrr::pmap(mentions, function(start, end, match) {

handle <- stringr::str_remove(match, "@")
did <- do.call(com_atproto_identity_resolve_handle, list(handle = handle)) |>
did <- do.call(com_atproto_identity_resolve_handle, list(handle = match)) |>
purrr::pluck("did")

list(
index = list(byteStart = start - 1, byteEnd = end - 1),
index = list(byteStart = start, byteEnd = end),
features = list(list("$type" = "app.bsky.richtext.facet#mention", "did" = did))
)

}) |>
append(facets)

url_regex <- "(?<=[$|\\W])(https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b([-a-zA-Z0-9()@:%_\\+.~#?&//=]*[-a-zA-Z0-9@%_\\+~#//=])?)"
urls <- str_locate_all_bytes(text, url_regex)
urls <- str_locate_all_bytes(text, regexs$url_regex)
facets <- purrr::pmap(urls, function(start, end, match) {
list(
index = list(byteStart = start - 1, byteEnd = end - 1),
index = list(byteStart = start, byteEnd = end),
features = list(list("$type" = "app.bsky.richtext.facet#link", "uri" = match))
)
}) |>
Expand Down
4 changes: 2 additions & 2 deletions R/utils.r
Original file line number Diff line number Diff line change
Expand Up @@ -246,11 +246,11 @@ str_locate_all_bytes <- function(string, pattern) {

if (nrow(spans) > 0) {
# add matched text before shifting locations
spans$match <- substr(string, spans$start, spans$end)
spans$match <- substr(string, spans$start + 1, spans$end)
# shift locations using byte lengths
for (i in seq_along(spans$start)) {
spans$start[i] <- sum(byte_len$b_len[1:spans$start[i]])
spans$end[i] = sum(byte_len$b_len[1:spans$end[i]]) + 1
spans$end[i] = sum(byte_len$b_len[1:spans$end[i]])
}
}
return(spans)
Expand Down
10 changes: 10 additions & 0 deletions data-raw/regexs.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
## Builds the `regexs` dataset: the regular expressions used to find mentions
## and URLs in text.
## Patterns are copied and adapted from
## https://atproto.com/blog/create-post#mentions-and-links
##
## Both patterns begin with the character class `[$|\W]`, so every match
## includes the single character that precedes the mention or URL.
## NOTE(review): inside a character class `$` and `|` are literal characters,
## so this class presumably does not match at the very start of the text --
## confirm whether a mention/URL in first position needs to be supported.
regexs <- list()

# Mentions of user handles. Compared with the original, the first capture group
# starts after the "@" instead of before it.
# original: rb"[$|\W](@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)"
regexs$mention_regex <- "[$|\\W]@(([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)"

# URLs. Same pattern as the original, with backslashes doubled for R strings.
# original: rb"[$|\W](https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?)"
regexs$url_regex <- "[$|\\W](https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b([-a-zA-Z0-9()@:%_\\+.~#?&//=]*[-a-zA-Z0-9@%_\\+~#//=])?)"

# Store the list as package data, replacing any previously saved version.
usethis::use_data(regexs, overwrite = TRUE)
Binary file added data/regexs.rda
Binary file not shown.
26 changes: 26 additions & 0 deletions man/regexs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions tests/testthat/test-utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,15 @@ test_that("verbosity", {
FALSE
)
})


test_that("byte-accurate string locate", {
  # The sample text puts multi-byte characters (emoji) in front of both the
  # mention and the URL, so byte offsets differ from character offsets.
  text <- "✨ example mentioning @atproto.com to share the URL 👨‍❤️‍👨 https://en.wikipedia.org/wiki/CBOR."

  # Expected values were checked against the outcome of the code in
  # <https://atproto.com/blog/create-post#mentions-and-links>
  mention_span <- str_locate_all_bytes(text, regexs$mention_regex)
  expect_equal(
    mention_span,
    tibble::tibble(start = 23, end = 35, match = "@atproto.com")
  )

  url_span <- str_locate_all_bytes(text, regexs$url_regex)
  expect_equal(
    url_span,
    tibble::tibble(
      start = 74,
      end = 108,
      match = "https://en.wikipedia.org/wiki/CBOR"
    )
  )
})

0 comments on commit 6e1d24d

Please sign in to comment.