Fast and simple url parser for R. Initially developed for the
paws.common
package.
urlparse::url_parse("https://user:pass@host.com:8000/path?query=1#fragment")
#> $scheme
#> [1] "https"
#>
#> $user
#> [1] "user"
#>
#> $password
#> [1] "pass"
#>
#> $host
#> [1] "host.com"
#>
#> $port
#> [1] "8000"
#>
#> $path
#> [1] "/path"
#>
#> $raw_path
#> [1] ""
#>
#> $query
#> $query$query
#> [1] "1"
#>
#>
#> $raw_query
#> [1] "query=1"
#>
#> $fragment
#> [1] "fragment"
You can install the development version of urlparse like so:
remotes::install_github("dyfanjones/urlparse")
r-universe installation:
install.packages("urlparse", repos = c("https://dyfanjones.r-universe.dev", "https://cloud.r-project.org"))
This is a basic example which shows you how to solve a common problem:
library(urlparse)
url_encoder("foo = bar + 5")
#> [1] "foo%20%3D%20bar%20%2B%205"
url_decoder(url_encoder("foo = bar + 5"))
#> [1] "foo = bar + 5"
Similar to Python's quote (from urllib.parse import quote),
urlparse::url_encoder supports the safe parameter: the additional
ASCII characters that should not be encoded.
from urllib.parse import quote
quote("foo = bar + 5", safe = "+")
#> 'foo%20%3D%20bar%20+%205'
url_encoder("foo = bar + 5", safe = "+")
#> [1] "foo%20%3D%20bar%20+%205"
Modify a URL through piping using the set_* functions, or by using
the standalone url_modify function.
url <- "http://example.com"
set_scheme(url, "https") |>
set_port(1234L) |>
set_path("foo/bar") |>
set_query("baz") |>
set_fragment("quux")
#> [1] "https://example.com:1234/foo/bar?baz#quux"
url_modify(url, scheme = "https", port = 1234, path = "foo/bar", query = "baz", fragment = "quux")
#> [1] "https://example.com:1234/foo/bar?baz#quux"
Note: it is faster to use url_modify rather than piping the set_*
functions. This is because urlparse has to parse the url within each
set_* call in order to modify it.
url <- "http://example.com"
bench::mark(
  piping = {set_scheme(url, "https") |>
      set_port(1234L) |>
      set_path("foo/bar") |>
      set_query("baz") |>
      set_fragment("quux")},
  single_function = url_modify(url, scheme = "https", port = 1234, path = "foo/bar", query = "baz", fragment = "quux")
)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 piping 5.29µs 5.9µs 162854. 0B 16.3
#> 2 single_function 1.6µs 1.8µs 517151. 0B 51.7
url <- "https://user:pass@host.com:8000/path?query=1#fragment"
(bm <- bench::mark(
  urlparse = urlparse::url_parse(url),
  httr2 = httr2::url_parse(url),
  curl = curl::curl_parse_url(url),
  urltools = urltools::url_parse(url),
  check = F
))
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 urlparse 1.72µs 1.93µs 488712. 0B 0
#> 2 httr2 22.39µs 23.86µs 40406. 571.07KB 24.3
#> 3 curl 27.06µs 29.64µs 29324. 0B 14.7
#> 4 urltools 124.44µs 132.68µs 7119. 2.18MB 23.2
show_relative(bm)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 urlparse 1 1 68.6 NaN NaN
#> 2 httr2 13.0 12.4 5.68 Inf Inf
#> 3 curl 15.7 15.4 4.12 NaN Inf
#> 4 urltools 72.3 68.9 1 Inf Inf
ggplot2::autoplot(bm)
Since urlparse v0.1.999+ you can use the vectorised url parser
url_parse_v2:
urls <- c(
"https://www.example.com",
"https://www.google.com/maps/place/Pennsylvania+Station/@40.7519848,-74.0015045,14.7z/data=!4m5!3m4!1s0x89c259ae15b2adcb:0x7955420634fd7eba!8m2!3d40.750568!4d-73.993519",
"https://user_1:password_1@example.org:8080/dir/../api?q=1#frag",
"https://user:password@example.com",
"https://www.example.com:8080/search%3D1%2B3",
"https://www.google.co.jp/search?q=\u30c9\u30a4\u30c4",
"https://www.example.com:8080?var1=foo&var2=ba%20r&var3=baz+larry",
"https://user:password@example.com:8080",
"https://user:password@example.com",
"https://user@example.com:8080",
"https://user@example.com"
)
(bm <- bench::mark(
  urlparse = lapply(urls, urlparse::url_parse),
  urlparse_v2 = urlparse::url_parse_v2(urls),
  httr2 = lapply(urls, httr2::url_parse),
  curl = lapply(urls, curl::curl_parse_url),
  urltools = urltools::url_parse(urls),
  check = F
))
#> # A tibble: 5 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 urlparse 19.4µs 21.3µs 46214. 200B 13.9
#> 2 urlparse_v2 10.5µs 11µs 87963. 4.3KB 0
#> 3 httr2 171.6µs 181.6µs 5232. 0B 10.2
#> 4 curl 188.7µs 198.4µs 4895. 0B 8.14
#> 5 urltools 130µs 142.1µs 6569. 0B 10.2
show_relative(bm)
#> # A tibble: 5 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 urlparse 1.85 1.94 9.44 Inf Inf
#> 2 urlparse_v2 1 1 18.0 Inf NaN
#> 3 httr2 16.4 16.5 1.07 NaN Inf
#> 4 curl 18.0 18.1 1 NaN Inf
#> 5 urltools 12.4 12.9 1.34 NaN Inf
ggplot2::autoplot(bm)
Note: url_parse_v2 returns the parsed url as a data.frame; this is
similar behaviour to urltools and adaR:
urlparse::url_parse_v2(urls)
#> href
#> 1 https://www.example.com
#> 2 https://www.google.com/maps/place/Pennsylvania+Station/@40.7519848,-74.0015045,14.7z/data=!4m5!3m4!1s0x89c259ae15b2adcb:0x7955420634fd7eba!8m2!3d40.750568!4d-73.993519
#> 3 https://user_1:password_1@example.org:8080/dir/../api?q=1#frag
#> 4 https://user:password@example.com
#> 5 https://www.example.com:8080/search%3D1%2B3
#> 6 https://www.google.co.jp/search?q=ドイツ
#> 7 https://www.example.com:8080?var1=foo&var2=ba%20r&var3=baz+larry
#> 8 https://user:password@example.com:8080
#> 9 https://user:password@example.com
#> 10 https://user@example.com:8080
#> 11 https://user@example.com
#> scheme user password host port
#> 1 https www.example.com
#> 2 https www.google.com
#> 3 https user_1 password_1 example.org 8080
#> 4 https user password example.com
#> 5 https www.example.com 8080
#> 6 https www.google.co.jp
#> 7 https www.example.com 8080
#> 8 https user password example.com 8080
#> 9 https user password example.com
#> 10 https user example.com 8080
#> 11 https user example.com
#> path
#> 1
#> 2 /maps/place/Pennsylvania+Station/@40.7519848,-74.0015045,14.7z/data=!4m5!3m4!1s0x89c259ae15b2adcb:0x7955420634fd7eba!8m2!3d40.750568!4d-73.993519
#> 3 /dir/../api
#> 4
#> 5 /search=1+3
#> 6 /search
#> 7
#> 8
#> 9
#> 10
#> 11
#> raw_path
#> 1
#> 2 /maps/place/Pennsylvania+Station/@40.7519848,-74.0015045,14.7z/data=!4m5!3m4!1s0x89c259ae15b2adcb:0x7955420634fd7eba!8m2!3d40.750568!4d-73.993519
#> 3
#> 4
#> 5 /search%3D1%2B3
#> 6
#> 7
#> 8
#> 9
#> 10
#> 11
#> raw_query fragment
#> 1
#> 2
#> 3 q=1 frag
#> 4
#> 5
#> 6 q=%E3%83%89%E3%82%A4%E3%83%84
#> 7 var1=foo&var2=ba%20r&var3=baz%2Blarry
#> 8
#> 9
#> 10
#> 11
Note: urltools encodes special characters to lower case hex,
i.e. “?” -> “%3f” instead of “%3F”.
string <- "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~`!@#$%^&*()=+[{]}\\|;:'\",<>/? "
(bm <- bench::mark(
  urlparse = urlparse::url_encoder(string),
  curl = curl::curl_escape(string),
  urltools = urltools::url_encode(string),
  base = URLencode(string, reserved = T),
  check = F
))
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 urlparse 1.48µs 1.64µs 581380. 208B 0
#> 2 curl 2.25µs 2.58µs 349595. 3.03KB 0
#> 3 urltools 2.34µs 2.54µs 381930. 2.48KB 0
#> 4 base 78.84µs 82.33µs 11746. 28.59KB 8.25
show_relative(bm)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 urlparse 1 1 49.5 1 NaN
#> 2 curl 1.53 1.57 29.8 14.9 NaN
#> 3 urltools 1.58 1.55 32.5 12.2 NaN
#> 4 base 53.4 50.2 1 141. Inf
ggplot2::autoplot(bm)
string <- "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~`!@#$%^&*()=+[{]}\\|;:'\",<>/? "
url <- paste0(sample(strsplit(string, "")[[1]], 1e4, replace = TRUE), collapse = "")
(bm <- bench::mark(
  urlparse = urlparse::url_encoder(url),
  curl = curl::curl_escape(url),
  urltools = urltools::url_encode(url),
  base = URLencode(url, reserved = T, repeated = T),
  check = F,
  filter_gc = F
))
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 urlparse 86.4µs 88.3µs 10599. 15.8KB 0
#> 2 curl 91.6µs 94.9µs 10306. 0B 0
#> 3 urltools 241.8µs 247.8µs 3943. 15.8KB 0
#> 4 base 6.7ms 7ms 138. 333.5KB 8.00
show_relative(bm)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 urlparse 1 1 76.8 Inf NaN
#> 2 curl 1.06 1.08 74.7 NaN NaN
#> 3 urltools 2.80 2.81 28.6 Inf NaN
#> 4 base 77.6 79.3 1 Inf Inf
ggplot2::autoplot(bm)