tidyverse
diff --git a/R/chunk.R b/R/chunk.R
Lines changed: 89 additions & 65 deletions
diff --git a/R/ellmer.R b/R/ellmer.R
Lines changed: 20 additions & 16 deletions
diff --git a/R/embed-bedrock.R b/R/embed-bedrock.R
Lines changed: 20 additions & 9 deletions
@@ -1,36 +1,37 @@
-
-
 pick_cut_positions <- function(candidates, chunk_size) {
-  .Call(pick_cut_positions_,
-        as.integer(candidates),
-        as.integer(chunk_size))
+  .Call(pick_cut_positions_, as.integer(candidates), as.integer(chunk_size))
 }
 
-str_chunk1 <- function(string,
-                       candidate_cutpoints,
-                       # assuming:
-                       #   1 token ~ 4 characters
-                       #   one page ~ 400 tokens
-                       #   target chunk size ~ 1 page
-                       max_size = 1600L,
-                       trim = TRUE) {
-  if(isTRUE(is.na(string)))
-    return(NA_character_)
+str_chunk1 <- function(
+  string,
+  candidate_cutpoints,
+  # assuming:
+  #   1 token ~ 4 characters
+  #   one page ~ 400 tokens
+  #   target chunk size ~ 1 page
+  max_size = 1600L,
+  trim = TRUE
+) {
+  if (isTRUE(is.na(string))) return(NA_character_)
   check_string(string, allow_na = TRUE)
   string_len <- stri_length(string)
-  if (string_len <= max_size)
-    return(string)
+  if (string_len <= max_size) return(string)
 
   candidate_cutpoints <- c(
-    1L, as.integer(candidate_cutpoints), string_len
+    1L,
+    as.integer(candidate_cutpoints),
+    string_len
   )
 
   cut_points <- pick_cut_positions(candidate_cutpoints, max_size)
-  chunks <- stri_sub(string, drop_last(cut_points), drop_first(cut_points),
-                     use_matrix = FALSE)
+  chunks <- stri_sub(
+    string,
+    drop_last(cut_points),
+    drop_first(cut_points),
+    use_matrix = FALSE
+  )
 
-  if (trim)
-    chunks <- stri_trim_both(chunks)
+  if (trim) chunks <- stri_trim_both(chunks)
 
   chunks <- chunks[nzchar(chunks)]
 
@@ -53,7 +54,9 @@ str_locate_boundaries1 <- function(string, boundary) {
       ## then split on raw vector.
       ## ... or use stringi to convert byte to char indexes, e.g.,
       ## stri_split_boundaries(x, type = "char")[[1]] |>  stri_numbytes()
-      paragraph = stri_locate_all_fixed(string, "\n\n", omit_no_match = TRUE)[[1L]][, "end"],
+      paragraph = stri_locate_all_fixed(string, "\n\n", omit_no_match = TRUE)[[
+        1L
+      ]][, "end"],
 
       # Note, stri_opts_brkiter 'type = line_break' is really about finding
       # candidates line break for the purpose of line wrapping a string, not
@@ -62,11 +65,17 @@ str_locate_boundaries1 <- function(string, boundary) {
       # stri_split_lines() does more comprehensive identification of line
       # breaks, but isn't exported as a boundary detector. Most text passing
       # through here is expected to have been normalized as markdown already...
-      line_break = stri_locate_all_fixed(string, "\n", omit_no_match = TRUE)[[1L]][, "end"],
+      line_break = stri_locate_all_fixed(string, "\n", omit_no_match = TRUE)[[
+        1L
+      ]][, "end"],
 
       sentence = ,
       word = ,
-      character = stri_locate_all_boundaries(string, type = boundary, locale = "@ss=standard")[[1L]][, "end"],
+      character = stri_locate_all_boundaries(
+        string,
+        type = boundary,
+        locale = "@ss=standard"
+      )[[1L]][, "end"],
       stop(
         'boundaries values must be one of: "paragraph", "sentence", "line_break", "word", "character" or a stringr pattern'
       )
@@ -75,11 +84,13 @@ str_locate_boundaries1 <- function(string, boundary) {
   locations
 }
 
-str_chunk <- function(x, max_size,
-                      boundaries = c("paragraph", "sentence", "line_break", "word", "character"),
-                      trim = TRUE,
-                      simplify = TRUE) {
-
+str_chunk <- function(
+  x,
+  max_size,
+  boundaries = c("paragraph", "sentence", "line_break", "word", "character"),
+  trim = TRUE,
+  simplify = TRUE
+) {
   chunk1 <- function(string, boundary) {
     str_chunk1(
       string,
@@ -96,21 +107,22 @@ str_chunk <- function(x, max_size,
     repeat {
       lens <- stri_length(chunks)
       is_over_size <- lens > max_size
-      if (!any(is_over_size, na.rm = TRUE))
-        break
+      if (!any(is_over_size, na.rm = TRUE)) break
       boundaries <- boundaries[-1L]
-      if (!length(boundaries))
-        break
+      if (!length(boundaries)) break
       chunks <- as.list(chunks)
-      chunks[is_over_size] <- lapply(chunks[is_over_size], chunk1, boundaries[[1L]])
+      chunks[is_over_size] <- lapply(
+        chunks[is_over_size],
+        chunk1,
+        boundaries[[1L]]
+      )
       chunks <- unlist(chunks)
       # TODO: recurse and returned nested list of strings if simplify=FALSE
     }
     chunks
   })
 
-  if (simplify)
-    out <- unlist(out)
+  if (simplify) out <- unlist(out)
 
   out
 }
@@ -217,9 +229,14 @@ str_chunk <- function(x, max_size,
 #' @name ragnar_chunk
 #' @rdname ragnar_chunk
 #' @export
-ragnar_chunk <- function(x, max_size = 1600L,
-                         boundaries = c("paragraph", "sentence", "line_break", "word", "character"),
-                         ..., trim = TRUE, simplify = TRUE) {
+ragnar_chunk <- function(
+  x,
+  max_size = 1600L,
+  boundaries = c("paragraph", "sentence", "line_break", "word", "character"),
+  ...,
+  trim = TRUE,
+  simplify = TRUE
+) {
   if (is.data.frame(x)) {
     check_character(x[["text"]])
     x[["text"]] <- str_chunk(
@@ -230,8 +247,7 @@ ragnar_chunk <- function(x, max_size = 1600L,
       ...,
       simplify = FALSE
     )
-    if (simplify)
-      x <- tidyr::unchop(x, "text")
+    if (simplify) x <- tidyr::unchop(x, "text")
   } else {
     boundaries <- as_boundaries_list(boundaries)
     x <- str_chunk(
@@ -248,45 +264,55 @@ ragnar_chunk <- function(x, max_size = 1600L,
 
 #' @export
 #' @rdname ragnar_chunk
-ragnar_segment <- function(x,
-                           boundaries = "sentence",
-                           ...,
-                           trim = FALSE,
-                           simplify = TRUE) {
+ragnar_segment <- function(
+  x,
+  boundaries = "sentence",
+  ...,
+  trim = FALSE,
+  simplify = TRUE
+) {
   if (is.data.frame(x)) {
     check_character(x[["text"]])
-    x[["text"]] <- ragnar_segment(x[["text"]],
-                                  boundaries = boundaries,
-                                  trim = trim,
-                                  ...,
-                                  simplify = FALSE)
-    if (simplify)
-      x <- tidyr::unchop(x, "text")
+    x[["text"]] <- ragnar_segment(
+      x[["text"]],
+      boundaries = boundaries,
+      trim = trim,
+      ...,
+      simplify = FALSE
+    )
+    if (simplify) x <- tidyr::unchop(x, "text")
     return(x)
   }
 
   boundaries <- as_boundaries_list(boundaries)
   check_character(x)
   out <- lapply(x, function(string) {
     cutpoints <- lapply(boundaries, str_locate_boundaries1, string = string) |>
-      unlist() |> c(1L, stri_length(string)) |> sort() |> unique()
+      unlist() |>
+      c(1L, stri_length(string)) |>
+      sort() |>
+      unique()
     segments <- stri_sub(string, drop_last(cutpoints), drop_first(cutpoints))
-    if (trim)
-      segments <- stri_trim_both(segments)
+    if (trim) segments <- stri_trim_both(segments)
     segments
   })
 
-  if (simplify)
-    out <- unlist(out)
+  if (simplify) out <- unlist(out)
 
   out
 }
 
 #' @export
 #' @rdname ragnar_chunk
-ragnar_chunk_segments <- function(x, max_size = 1600L, ..., simplify = TRUE, trim = TRUE) {
+ragnar_chunk_segments <- function(
+  x,
+  max_size = 1600L,
+  ...,
+  simplify = TRUE,
+  trim = TRUE
+) {
   sep <- ""
-  if(is.data.frame(x)) {
+  if (is.data.frame(x)) {
     stopifnot(is.list(x[["text"]]), all(map_chr(x[["text"]]), is.character))
     x[["text"]] <- ragnar_chunk_segments(
       x[["text"]],
@@ -296,8 +322,7 @@ ragnar_chunk_segments <- function(x, max_size = 1600L, ..., simplify = TRUE, tri
       sep = sep,
       simplify = FALSE
     )
-    if (simplify)
-      x <- tidyr::unchop(x, "text")
+    if (simplify) x <- tidyr::unchop(x, "text")
     return(x)
   }
   check_string(sep)
@@ -311,8 +336,7 @@ ragnar_chunk_segments <- function(x, max_size = 1600L, ..., simplify = TRUE, tri
         ...
       )
     })
-    if (simplify)
-      out <- unlist(out)
+    if (simplify) out <- unlist(out)
 
     return(out)
   }
 
@@ -1,4 +1,3 @@
-
 #' Register a 'retrieve' tool with ellmer
 #'
 #' @param chat a `ellmer:::Chat` object.
@@ -23,20 +22,25 @@
 #' ragnar_register_tool_retrieve(chat, store)
 #' chat$chat("How can I subset a dataframe?")
 ragnar_register_tool_retrieve <-
-function(chat, store, store_description = "the knowledge store", ...) {
-  rlang::check_installed("ellmer")
-  store; list(...)
+  function(chat, store, store_description = "the knowledge store", ...) {
+    rlang::check_installed("ellmer")
+    store
+    list(...)
 
-  chat$register_tool(
-    ellmer::tool(
-      .name = glue::glue("rag_retrieve_from_{store@name}"),
-      function(text) {
-        ragnar_retrieve(store, text, ...)$text |>
-          stringi::stri_flatten("\n\n---\n\n")
-      },
-      glue::glue("Given a string, retrieve the most relevent excerpts from {store_description}."),
-      text = ellmer::type_string("The text to find the most relevent matches for.")
+    chat$register_tool(
+      ellmer::tool(
+        .name = glue::glue("rag_retrieve_from_{store@name}"),
+        function(text) {
+          ragnar_retrieve(store, text, ...)$text |>
+            stringi::stri_flatten("\n\n---\n\n")
+        },
+        glue::glue(
+          "Given a string, retrieve the most relevent excerpts from {store_description}."
+        ),
+        text = ellmer::type_string(
+          "The text to find the most relevent matches for."
+        )
+      )
     )
-  )
-  invisible(chat)
-}
+    invisible(chat)
+  }
@@ -1,4 +1,3 @@
-
 #' Embed text using a Bedrock model
 #'
 #' @inheritParams embed_ollama
@@ -20,7 +19,6 @@
 #'
 #' @export
 embed_bedrock <- function(x, model, profile, api_args = list()) {
-
   if (missing(x) || is.null(x)) {
     args <- capture_args()
     fn <- partial(quote(ragnar::embed_bedrock), alist(x = ), args)
@@ -49,7 +47,9 @@ embed_bedrock <- function(x, model, profile, api_args = list()) {
   }
 
   req <- httr2::request(paste0(
-    "https://bedrock-runtime.", credentials$region, ".amazonaws.com"
+    "https://bedrock-runtime.",
+    credentials$region,
+    ".amazonaws.com"
   ))
 
   req <- httr2::req_url_path_append(
@@ -99,7 +99,6 @@ embed_bedrock_cohere <- function(base_req, inputs, api_args, req_auth_bedrock) {
 
   out <- list()
   for (indices in chunk_list(seq_along(inputs), 20)) {
-
     body <- rlang::list2(
       texts = as.list(inputs[indices]),
       !!!api_args
@@ -113,7 +112,12 @@ embed_bedrock_cohere <- function(base_req, inputs, api_args, req_auth_bedrock) {
     out[indices] <- httr2::resp_body_json(resp)$embeddings
   }
 
-  matrix(unlist(out), nrow = length(inputs), ncol = length(out[[1]]), byrow = TRUE)
+  matrix(
+    unlist(out),
+    nrow = length(inputs),
+    ncol = length(out[[1]]),
+    byrow = TRUE
+  )
 }
 
 
@@ -140,7 +144,12 @@ embed_bedrock_titan <- function(base_req, inputs, api_args, req_auth_bedrock) {
     httr2::resp_body_json(resp)$embedding
   })
 
-  matrix(unlist(out), nrow = length(inputs), ncol = length(out[[1]]), byrow = TRUE)
+  matrix(
+    unlist(out),
+    nrow = length(inputs),
+    ncol = length(out[[1]]),
+    byrow = TRUE
+  )
 }
 
 chunk_list <- function(lst, n) {
@@ -149,8 +158,11 @@ chunk_list <- function(lst, n) {
 
 # Helpers ---------------------------------------------------------------------
 
-paws_credentials <- function(profile, cache = aws_creds_cache(profile),
-                             reauth = FALSE) {
+paws_credentials <- function(
+  profile,
+  cache = aws_creds_cache(profile),
+  reauth = FALSE
+) {
   creds <- cache$get()
   if (reauth || is.null(creds) || creds$expiration < Sys.time()) {
     cache$clear()
@@ -189,4 +201,3 @@ credentials_cache <- function(key) {
     clear = function() env_unbind(the$credentials_cache, key)
   )
 }
-