@@ -38,6 +38,10 @@ ragnar_retrieve_vss <- function(
3838 check_string(text )
3939 check_number_whole(top_k )
4040 method <- rlang :: arg_match(method )
41+ if (inherits(store , " tbl_sql" )) {
42+ tbl <- store
43+ return (ragnar_retrieve_vss_tbl(tbl , text , top_k , method ))
44+ }
4145
4246 cols <- names(store @ schema ) | >
4347 stringi :: stri_subset_regex(" ^embedding$" , negate = TRUE ) | >
@@ -66,22 +70,22 @@ ragnar_retrieve_vss <- function(
6670# store |> dplyr::mutate(score = calculate_vss(store, text))
6771# using dbplyr
# Build the SQL expression that scores each row's `embedding` against `text`.
#
# `store` is anything `get_store_embed()` accepts. `text` is embedded with the
# store's embed function and inlined as a `FLOAT[n]` array literal; `method`
# picks the SQL function through `method_to_info()`.
#
# Returns a glue string holding the SQL fragment (not wrapped in `sql()`).
# Aborts when the store has no embed function, or when the embed function
# does not return a one-row matrix.
calculate_vss <- function(store, text, method) {
  embed <- get_store_embed(store)
  if (is.null(embed)) {
    cli::cli_abort("Store must have an embed function but got {.code NULL}")
  }

  embedded_text <- embed(text)
  # ncol() of a non-matrix is NULL, which would silently propagate into the
  # glue template below and fail later with an unrelated error; a multi-row
  # result would not match the FLOAT[n] cast. Fail early with a clear message.
  if (!is.matrix(embedded_text) || nrow(embedded_text) != 1L) {
    cli::cli_abort(
      "Embed function must return a one-row matrix for a single string"
    )
  }
  embedding_size <- ncol(embedded_text)

  # Only the SQL function name is needed here; the sort direction is unused.
  .[method_function, ..] <- method_to_info(method)
  glue::glue(
    r"---(
    {method_function}(
      embedding,
      [{stri_flatten(embedded_text, ", ")}]::FLOAT[{embedding_size}]
    )
    )---"
  )
}
8791
@@ -94,10 +98,71 @@ method_to_info <- function(method) {
9498 euclidean_distance = c(" array_distance" , " ASC" ),
9599 negative_dot_product = c(" array_negative_dot_product" , " ASC" ),
96100 cosine_similarity = c(" array_cosine_similarity" , " DESC" ),
97- dot_product = c(" array_dot_product" , " DESC" )
101+ dot_product = c(" array_dot_product" , " DESC" ),
102+ stop(" Unknown method" )
98103 )
99104}
100105
106+
# Look up the embedding function that belongs to a store-like object.
#
# `x` is either a `RagnarStore`, whose `embed` property is returned as-is
# (it may be `NULL`), or a lazy `tbl_sql`. For a `tbl_sql` the function is
# read from the "embed_function" attribute on the connection's `conn_ref`;
# when that is missing it is unserialized from the `metadata` table and
# stored back on `conn_ref` for later calls.
#
# Returns the embed function, or `NULL` when none is recorded.
# Aborts when `x` is neither a `RagnarStore` nor a `tbl_sql`.
get_store_embed <- function(x) {
  if (S7_inherits(x, RagnarStore)) {
    return(x@embed)
  }

  if (!inherits(x, "tbl_sql")) {
    cli::cli_abort("`store` must be a RagnarStore or a dplyr::tbl()")
  }

  con <- dbplyr::remote_con(x)
  # NOTE(review): assumes a connection class with a `conn_ref` slot (as duckdb
  # connections have) whose value keeps attributes set on it -- confirm.
  ptr <- con@conn_ref
  embed <- attr(ptr, "embed_function", exact = TRUE)
  if (!is.null(embed)) {
    return(embed)
  }

  # Attribute missing: reread from db and cache on ptr
  metadata <- DBI::dbGetQuery(
    con,
    "SELECT embed_func FROM metadata LIMIT 1"
  )
  # An empty metadata table (or an empty blob) means no embed function is
  # recorded; report that as NULL instead of failing on `[[1]]`.
  if (nrow(metadata) == 0L) {
    return(NULL)
  }
  embed_blob <- metadata$embed_func[[1L]]
  if (length(embed_blob) == 0L) {
    return(NULL)
  }

  # NOTE(review): unserialize() restores arbitrary R objects; this trusts the
  # database file. Only open stores from sources you trust.
  embed <- unserialize(embed_blob)
  attr(ptr, "embed_function") <- embed
  embed
}
132+
133+
# Vector-similarity retrieval for a lazy `tbl_sql` derived from a store.
#
# Scores every row of `tbl` against `text` with `method`, drops the
# `embedding` column, orders by the method's sort direction, keeps the first
# `top_k` rows and collects the result into R. The output carries
# `metric_value` and `metric_name` columns.
ragnar_retrieve_vss_tbl <- function(tbl, text, top_k, method) {
  .[.., order_key] <- method_to_info(method)

  # Build the SQL fragments before entering the dplyr verbs and inject them
  # with `!!`. The table has a `text` column and may carry arbitrary extra
  # columns (e.g. one named `method`), so referring to the arguments by bare
  # name inside mutate() would depend on how dbplyr resolves the clash.
  score_sql <- dbplyr::sql(calculate_vss(tbl, text, method))
  order_sql <- dbplyr::sql(glue::glue("metric_value {order_key}"))

  tbl |>
    mutate(
      metric_value = !!score_sql,
      metric_name = !!method
    ) |>
    select(-"embedding") |>
    arrange(!!order_sql) |>
    head(n = top_k) |>
    collect()
}
146+
# BM25 retrieval for a lazy `tbl_sql` derived from a store.
#
# Adds the `fts_main_chunks.match_bm25()` score of each row for `text` as
# `metric_value` (with `metric_name` set to "bm25"), keeps rows that have a
# score, sorts by it, drops the `embedding` column, takes the first `top_k`
# rows and collects them into R.
ragnar_retrieve_bm25_tbl_sql <- function(tbl, text, top_k) {
  # Quote the search text through the connection so it is safe to inline.
  quoted <- DBI::dbQuoteString(dbplyr::remote_con(tbl), text)
  score_sql <- dbplyr::sql(
    glue::glue("fts_main_chunks.match_bm25(id, {quoted})")
  )

  scored <- mutate(tbl, metric_value = !!score_sql, metric_name = "bm25")
  matched <- filter(scored, sql("metric_value IS NOT NULL"))

  # NOTE(review): this sorts ascending; if a higher BM25 score means a better
  # match, `top_k` keeps the weakest hits -- confirm the intended direction
  # against the non-tbl retrieval path.
  matched |>
    arrange(.data$metric_value) |>
    select(-"embedding") |>
    head(n = top_k) |>
    collect()
}
164+
165+
101166# ' Retrieves chunks using the BM25 score
102167# '
103168# ' BM25 refers to Okapi Best Matching 25. See \doi{10.1561/1500000019} for more information.
@@ -108,6 +173,9 @@ method_to_info <- function(method) {
108173ragnar_retrieve_bm25 <- function (store , text , top_k = 3L ) {
109174 check_string(text )
110175 check_number_whole(top_k )
176+ if (inherits(store , " tbl_sql" )) {
177+ return (ragnar_retrieve_bm25_tbl_sql(store , text , top_k ))
178+ }
111179
112180 cols <- names(store @ schema ) | >
113181 stringi :: stri_subset_regex(" ^embedding$" , negate = TRUE ) | >
@@ -168,7 +236,6 @@ ragnar_retrieve_vss_and_bm25 <- function(store, text, top_k = 3, ...) {
168236 )
169237
170238 # TODO: come up with a nice reordering that doesn't involve too much compute.
171-
172239 as_tibble(out )
173240}
174241
@@ -178,18 +245,54 @@ ragnar_retrieve_vss_and_bm25 <- function(store, text, top_k = 3, ...) {
178245# ' [ragnar_retrieve()] is a thin wrapper around [ragnar_retrieve_vss_and_bm25()]
179246# ' using the recommended best practices.
180247# '
181- # ' @param store A `RagnarStore` object.
248+ # ' @param store A `RagnarStore` object or a `dplyr::tbl()` derived from
249+ # ' it. When you pass a `tbl`, you may use usual dplyr verbs (e.g.
250+ # ' `filter()`, `slice()`) to restrict the rows examined before
251+ # ' similarity scoring. Avoid dropping essential columns such as
252+ # ' `text`, `embedding`, `origin`, and `hash`.
182253# ' @param text A string to find the nearest match to
183254# ' @param top_k Integer, the number of nearest entries to find *per method*.
184255# '
185256# ' @returns A dataframe of retrieved chunks. Each row corresponds to an
186257# ' individual chunk in the store. It always contains a column named `text`
187258# ' that contains the chunks.
188259# '
260+ # ' @section Pre-filtering with dplyr:
261+ # ' The store behaves like a lazy table backed by DuckDB, so row-wise
262+ # ' filtering is executed directly in the database. This lets you narrow the
263+ # ' search space efficiently without pulling data into R.
264+ # '
189265# ' @family ragnar_retrieve
190266# ' @export
267+ # ' @examples
268+ # ' # Basic usage
269+ # ' mock_embed <- function(x) matrix(stats::runif(10), nrow = length(x), ncol = 10)
270+ # ' store <- ragnar_store_create(embed = mock_embed)
271+ # ' ragnar_store_insert(store, data.frame(text = c("foo", "bar")))
272+ # ' ragnar_store_build_index(store)
273+ # ' ragnar_retrieve(store, "foo")
274+ # '
275+ # ' # More Advanced: store metadata, retrieve with pre-filtering
276+ # ' store <- ragnar_store_create(
277+ # ' embed = mock_embed,
278+ # ' extra_cols = data.frame(category = character())
279+ # ' )
280+ # ' ragnar_store_insert(
281+ # ' store,
282+ # ' data.frame(
283+ # ' category = c("dessert", "dessert", "dessert", "meal", "meal", "meal"),
284+ # ' text = c("ice cream", "cake", "cookies", "pasta", "burger", "salad")
285+ # ' )
286+ # ' )
287+ # ' ragnar_store_build_index(store)
288+ # '
289+ # ' # simple retrieve
290+ # ' ragnar_retrieve(store, "yummy")
291+ # '
292+ # ' # retrieve with pre-filtering
293+ # ' dplyr::tbl(store) |>
294+ # ' dplyr::filter(category == "meal") |>
295+ # ' ragnar_retrieve("yummy")
ragnar_retrieve <- function(store, text, top_k = 3L) {
  # Delegate to the combined VSS + BM25 retrieval, forwarding every argument.
  ragnar_retrieve_vss_and_bm25(store = store, text = text, top_k = top_k)
}
194-
195- # TODO: re-ranking.
0 commit comments