From 6e1d24db3eec654a6a8f9490db842f9783aa83a0 Mon Sep 17 00:00:00 2001 From: JBGruber Date: Sun, 22 Oct 2023 22:09:21 +0200 Subject: [PATCH] cleaning up and testing str_locate_all_bytes --- R/data.R | 12 ++++++++++++ R/feed.r | 14 ++++++-------- R/utils.r | 4 ++-- data-raw/regexs.R | 10 ++++++++++ data/regexs.rda | Bin 0 -> 270 bytes man/regexs.Rd | 26 ++++++++++++++++++++++++++ tests/testthat/test-utils.R | 12 ++++++++++++ 7 files changed, 68 insertions(+), 10 deletions(-) create mode 100644 data-raw/regexs.R create mode 100644 data/regexs.rda create mode 100644 man/regexs.Rd diff --git a/R/data.R b/R/data.R index 0891855..50d7d4d 100644 --- a/R/data.R +++ b/R/data.R @@ -10,3 +10,15 @@ #' } #' @source "list_lexicons" + + +#' Regular expressions to match mentions and URLs +#' +#' @format ## `regexs` +#' A list with regular expressions to match: +#' \describe{ +#' \item{mention_regex}{Mentions of user handles like @atproto.com} +#' \item{url_regex}{URLs} +#' } +#' @source +"regexs" diff --git a/R/feed.r b/R/feed.r index 4be8e1c..6cea114 100644 --- a/R/feed.r +++ b/R/feed.r @@ -536,27 +536,25 @@ get_feed_likes <- function(feed_url, parse_facets <- function(text) { facets <- list() - mention_regex <- "(?<=[$|\\W])(@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)" - mentions <- str_locate_all_bytes(text, mention_regex) + mentions <- str_locate_all_bytes(text, regexs$mention_regex) + mentions$match <- stringr::str_remove(mentions$match, "@") facets <- purrr::pmap(mentions, function(start, end, match) { - handle <- stringr::str_remove(match, "@") - did <- do.call(com_atproto_identity_resolve_handle, list(handle = handle)) |> + did <- do.call(com_atproto_identity_resolve_handle, list(handle = match)) |> purrr::pluck("did") list( - index = list(byteStart = start - 1, byteEnd = end - 1), + index = list(byteStart = start, byteEnd = end), features = list(list("$type" = "app.bsky.richtext.facet#mention", "did" = did)) ) }) |> append(facets) - url_regex <- "(?<=[$|\\W])(https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b([-a-zA-Z0-9()@:%_\\+.~#?&//=]*[-a-zA-Z0-9@%_\\+~#//=])?)" - urls <- str_locate_all_bytes(text, url_regex) + urls <- str_locate_all_bytes(text, regexs$url_regex) facets <- purrr::pmap(urls, function(start, end, match) { list( - index = list(byteStart = start - 1, byteEnd = end - 1), + index = list(byteStart = start, byteEnd = end), features = list(list("$type" = "app.bsky.richtext.facet#link", "uri" = match)) ) }) |> diff --git a/R/utils.r b/R/utils.r index 6cf869c..aaf9311 100644 --- a/R/utils.r +++ b/R/utils.r @@ -246,11 +246,11 @@ str_locate_all_bytes <- function(string, pattern) { if (nrow(spans) > 0) { # add matched text before shifting locations - spans$match <- substr(string, spans$start, spans$end) + spans$match <- substr(string, spans$start + 1, spans$end) # shift locations using byte lengths for (i in seq_along(spans$start)) { spans$start[i] <- sum(byte_len$b_len[1:spans$start[i]]) - spans$end[i] = sum(byte_len$b_len[1:spans$end[i]]) + 1 + spans$end[i] = sum(byte_len$b_len[1:spans$end[i]]) } } return(spans) diff --git a/data-raw/regexs.R b/data-raw/regexs.R new file mode 100644 index 0000000..83a7387 --- /dev/null +++ b/data-raw/regexs.R @@ -0,0 +1,10 @@ +## code to prepare `regexs` dataset goes here +## copied and adapted from https://atproto.com/blog/create-post#mentions-and-links +regexs <- list( + # original: rb"[$|\W](@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)" + mention_regex = "[$|\\W]@(([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)", + # original: rb"[$|\W](https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?)" + url_regex = "[$|\\W](https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b([-a-zA-Z0-9()@:%_\\+.~#?&//=]*[-a-zA-Z0-9@%_\\+~#//=])?)" +) + +usethis::use_data(regexs, overwrite = TRUE) diff --git a/data/regexs.rda b/data/regexs.rda new file mode 100644 index 0000000000000000000000000000000000000000..b75830dd5f28b6d856c66b33384437ddafa2615b GIT binary patch literal 270 zcmV+p0rCDqT4*^jL0KkKSq<_6R{#Kh|G@t;NB|Fi^D^WB5Z9-wKafB*slgJz}_ z0&I3zdHkX!(t;oxGYMq@V1pGw8356P1wrO8K+k!B6Y6RjIm?uW0`j&n*M(Su!BIyz zRY1}qES!*tMn<`f%RqbC3IdOo^xRSPeE@hnTv-i>OkZ%(Fd-aQhPdql3wtDK%)u-_ z+?k@2aV#;Fxeh}#Bq7ZL=ik)z5rYGZ4^C2r5{cRwQ}d~*+JH*7RJ63Yp$Vv1K@I>c Ud=d?U3}yUX$rRy2KsU$ + expect_equal({ + str_locate_all_bytes(text, regexs$mention_regex) + }, tibble::tibble(start = 23, end = 35, match = "@atproto.com")) + expect_equal({ + str_locate_all_bytes(text, regexs$url_regex) + }, tibble::tibble(start = 74, end = 108, match = "https://en.wikipedia.org/wiki/CBOR")) +})