concordance_naive <- function(x, y) {
dx <- sign(outer(x, x, `-`))
dy <- sign(outer(y, y, `-`))
.rowSums(dx * dy, nrow(dx), ncol(dx))
}Fast Concordance Vector in R
1. The problem
We want, for each observation \(i\), the sum
\[ c_i = \sum_{j \ne i} \operatorname{sign}(x_j - x_i)\,\operatorname{sign}(y_j - y_i), \]
where sign is the usual sign function. This is essentially a per-observation contribution to Kendall’s tau.
Naively, this is \(\mathcal{O}(n^2)\), which is infeasible for large \(n\).
2. Naive R solution
The naive R implementation uses outer to compute all pairwise differences:
This is simple, and for small \(n\) it is actually very fast because everything is vectorized. However, both dx and dy are \(n \times n\) matrices, so this is \(\mathcal{O}(n^2)\) in memory and time.
2b. Naive Rcpp solution
A straightforward Rcpp solution is:
Rcpp::sourceCpp(code = r"{
#include <Rcpp.h>
using namespace Rcpp;
int sign(double x){
return x == 0 ? 0 : x > 0 ? 1 : -1;
}
// Defined in Hollander, Wolfe & Chicken, Nonparametric Statistical Methods, 3ed., eq. (8.5), p. 394
int Q(double a, double b, double c, double d) {
return sign((d-b)*(c-a));
}
// Defined in Hollander, Wolfe & Chicken, Nonparametric Statistical Methods, 3ed., eq. (8.37), p. 415
// [[Rcpp::export]]
IntegerVector concordanceVector_cpp(const NumericVector& x, const NumericVector& y){
int n = x.size();
IntegerVector out(n);
for(int i = 0; i < n; i++){
out[i] = 0;
for(int t = 0; t < n; t++){
if(i != t) out[i] += Q(x[i], y[i], x[t], y[t]);
}
}
return out;
}
}"
)3. Optimized solution (Fenwick Tree)
The key optimization is to avoid the \(n \times n\) matrix entirely. Steps:
- Sort points by
x. Then \(\operatorname{sign}(x_j - x_i)\) is determined by index order, i.e., \(\operatorname{sign}(x_j - x_i) = \operatorname{sign}(j - i)\). - Replace
ywith integer ranks. - Use a Fenwick Tree (Binary Indexed Tree) to efficiently count how many larger/smaller
yvalues exist to the left and right of each index. - Combine these counts to get the concordance vector in \(O(n \log n)\).
This is the same trick used in fast Kendall’s tau algorithms, but adapted to return per-observation contributions.
4. Implementation of the optimized solution
The Fenwick tree algorithm is implemented in C++ using cpp11 in src/concordance.cpp and registered as concordance_fenwick_cpp. For reference, here is the equivalent pure-R implementation:
concordance_fenwick_cpp <- jaspRegression:::concordance_fenwick_cpp
# Pure-R reference implementation (kept for documentation)
concordance_fenwick_inplace <- function(x, y) {
n <- length(x)
stopifnot(length(y) == n)
if (n == 0L) return(integer(0L))
if (n == 1L) return(0L)
# 1) order by x (stable), keep x-ordered copies
ord <- order(x, y, method = "radix")
xo <- x[ord]
yo <- y[ord]
# 2) compute group runs for equal x (use rle to avoid building big lists)
if (anyDuplicated(xo)) {
rle_x <- rle(xo)
group_lengths <- rle_x$lengths
group_starts <- cumsum(c(1L, head(group_lengths, -1L)))
group_ends <- cumsum(group_lengths)
G <- length(group_lengths) # number of groups
} else {
# fast path
rle_x <- rle(xo)
group_lengths <- rep.int(1L, n)
group_starts <- 1:n
group_ends <- 1:n
G <- n # number of groups
}
# 3) compress y to ranks 1..m (strict ordering of unique y's)
uniq_y <- sort(unique(yo))
ry <- match(yo, uniq_y) # integer vector 1..m
m <- length(uniq_y)
# allocate bit and result vectors (integers)
bit <- integer(m) # 1-indexed BIT (positions 1..m)
less_right <- integer(n)
greater_right <- integer(n)
# originally there were separate vectors, but we can reuse some memory
# less_left <- integer(n)
# greater_left <- integer(n)
# precompute the results of bitwAnd so we can just do lookup
# lowbits <- (1:m) - ((1:m) & ((1:m) - 1L))
# lowbits <- (1:m) - bitwAnd((1:m), ((1:m) - 1L))
#lowbits <- vapply(1:m, \(i) bitwAnd(i, -i), 0L)# -> lowbits
lowbits <- bitwAnd(1:m, -1:-m)
# Helper: inline bit_sum and bit_add are implemented as loops below
# -------- Right sweep (groups processed from right to left) ----------
total_seen <- 0L
for (g in G:1L) {
i1 <- group_starts[g]
i2 <- group_ends[g]
# query each index in this group (do NOT add them yet)
for (i in i1:i2) {
r <- ry[i]
# less_right: bit_sum(r-1)
j <- r - 1L
s_less <- 0L
while (j > 0L) {
s_less <- s_less + bit[j]
j <- j - lowbits[j] #bitwAnd(j, -j)
}
# leq_count: bit_sum(r)
j <- r
s_leq <- 0L
while (j > 0L) {
s_leq <- s_leq + bit[j]
j <- j - lowbits[j] #bitwAnd(j, -j)
}
less_right[i] <- s_less
greater_right[i] <- total_seen - s_leq
}
# now add this entire group into the BIT (so earlier groups see them)
for (i in i1:i2) {
r <- ry[i]
j <- r
while (j <= m) {
bit[j] <- bit[j] + 1L
j <- j + lowbits[j] #bitwAnd(j, -j)
}
total_seen <- total_seen + 1L
}
}
out_xorder <- (greater_right - less_right) # partial result
# reset for left sweep, these are now less_left and greater_left
less_right[] <- 0L
greater_right[] <- 0L
# -------- Left sweep (groups processed from left to right) ----------
bit[] <- 0L # reset BIT in-place (no new allocation)
total_seen <- 0L
for (g in 1L:G) {
i1 <- group_starts[g]
i2 <- group_ends[g]
# query each index in this group (do NOT add them yet)
for (i in i1:i2) {
r <- ry[i]
# less_left: bit_sum(r-1)
j <- r - 1L
s_less <- 0L
while (j > 0L) {
s_less <- s_less + bit[j]
j <- j - lowbits[j] #bitwAnd(j, -j)
}
# leq_count: bit_sum(r)
j <- r
s_leq <- 0L
while (j > 0L) {
s_leq <- s_leq + bit[j]
j <- j - lowbits[j] #bitwAnd(j, -j)
}
# less_left[i] <- s_less
# greater_left[i] <- total_seen - s_leq
less_right[i] <- s_less
greater_right[i] <- total_seen - s_leq
}
# now add this group's ranks to BIT
for (i in i1:i2) {
r <- ry[i]
j <- r
while (j <= m) {
bit[j] <- bit[j] + 1L
j <- j + lowbits[j] #bitwAnd(j, -j)
}
total_seen <- total_seen + 1L
}
}
out_xorder <- out_xorder + (less_right - greater_right)
# reuse some memory
less_right[] <- 0L
less_right[ord] <- out_xorder
return(less_right)
# combine contributions (in x-sorted order), then restore original order
# out_xorder <- (greater_right - less_right) + (less_left - greater_left)
# out <- integer(n)
# out[ord] <- out_xorder
# out
}5. Benchmarking
We compare the naive and optimized versions using the bench package.
concordance_naive_cpp <- function(x, y) jaspRegression:::concordance_naive_cpp(as.double(x), as.double(y))
concordance_fenwick_cpp <- function(x, y) jaspRegression:::concordance_fenwick_cpp(as.double(x), as.double(y))
# results with ties
x <- sample(-5:5, 100, TRUE)
y <- sample(-5:5, 100, TRUE)
naive = concordance_naive(x, y)
naive_cpp = concordance_naive_cpp(x, y)
fenwick = concordance_fenwick_cpp(x, y)
all(naive == fenwick) && all(naive_cpp == fenwick)[1] TRUE
# results without ties
x <- rnorm(10)
y <- rnorm(10)
naive = concordance_naive(x, y)
naive_cpp = concordance_naive_cpp(x, y)
fenwick = concordance_fenwick_cpp(x, y)
all(naive == fenwick) && all(naive_cpp == fenwick)[1] TRUE
orders <- 3
ns <- c(sapply(seq_len(orders), \(o) {
from <- 10^o
to <- 10^(o+1) - from
seq(from, to, by = from) # e.g., 10, 100 - 10, 10
}), 10^(orders+1))
res <- suppressWarnings(bench::press(
n = ns,
{
x <- rnorm(n)
y <- rnorm(n)
bench::mark(
naive = concordance_naive(x, y),
naive_cpp = concordance_naive_cpp(x, y),
fenwick = concordance_fenwick_cpp(x, y),
check = TRUE, # ensure results are checked for equality
iterations = 3
)
},
.quiet = !interactive()
)) # suppressWarnings to hide the message "Some expressions had a GC in every iteration; so filtering is disabled."
res# A tibble: 84 × 7
expression n min median `itr/sec` mem_alloc `gc/sec`
<bch:expr> <dbl> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
1 naive 10 14.85µs 16.59µs 52159. 5.8KB 0
2 naive_cpp 10 1.88µs 2.48µs 332664. 0B 0
3 fenwick 10 2.14µs 3.05µs 248710. 0B 0
4 naive 20 35.9µs 40.02µs 25825. 22.4KB 0
5 naive_cpp 20 2.76µs 3.48µs 244976. 0B 0
6 fenwick 20 2.64µs 3.86µs 213717. 0B 0
7 naive 30 36.5µs 39µs 25039. 49.8KB 0
8 naive_cpp 30 3.94µs 5.01µs 191579. 0B 0
9 fenwick 30 3.1µs 4.63µs 193920. 0B 0
10 naive 40 55.7µs 56.01µs 17181. 88.6KB 0
# ℹ 74 more rows
6. Plot of runtime
res$algorithm <- as.character(res$expression)
idx_bad <- res$mem_alloc == 0
res$mem_alloc[idx_bad] <- min(res$mem_alloc[!idx_bad]) / 100 # to avoid divide by 0
lm_res <- res |>
group_by(algorithm) |>
summarize(
# Fit the model once and extract coefficients
time_fit = list(lm(log10(min) ~ log10(n))),
mem_fit = list(lm(log10(mem_alloc) ~ log10(n))),
.groups = "drop"
) |>
mutate(
time_intercept = unname(sapply(time_fit, \(x) coef(x)["(Intercept)"])),
time_slope = unname(sapply(time_fit, \(x) coef(x)["log10(n)"])),
mem_intercept = unname(sapply(mem_fit, \(x) coef(x)["(Intercept)"])),
mem_slope = unname(sapply(mem_fit, \(x) coef(x)["log10(n)"])),
# Remove the temporary columns if you don't need them
.keep = "unused"
)
idx <- match(c("fenwick", "naive", "naive_cpp"), lm_res$algorithm)
i1 <- which(lm_res$algorithm == "fenwick")
i2 <- seq_along(lm_res$algorithm)[-i1]
intersection_n_time <- 10^(with(lm_res, (time_intercept[i1] - time_intercept[i2]) / (time_slope[i2] - time_slope[i1])))
intersection_n_mem <- 10^(with(lm_res, (mem_intercept[i1] - mem_intercept[i2]) / (mem_slope[i2] - mem_slope[i1])))
intersection_y_time <- 10^(with(lm_res, (lm_res$time_intercept[i1] + lm_res$time_slope[i1] * log10(intersection_n_time))))
intersection_y_mem <- 10^(with(lm_res, (lm_res$mem_intercept[i1] + lm_res$mem_slope[i1] * log10(intersection_n_mem))))
intersection_n_time_rounded <- round(intersection_n_time)
# df_intersect <- data.frame(
# n_time = c(intersection_n_time, intersection_n_time),
# n_mem = c(intersection_n_mem, intersection_n_mem),
# y_time = c(min(res$min), as_bench_time(intersection_y_time)),
# y_mem = c(min(res$mem_alloc), as_bench_time(intersection_y_mem))
# )
idx <- match(c("fenwick", "naive"), lm_res$algorithm)
ref_lines <- data.frame(
order = c("O(n)", "O(n^2)"),
slope = c(1, 2), # Reference slopes for O(n) and O(n^2)
time_intercept = lm_res$time_intercept[idx],
mem_intercept = lm_res$mem_intercept[idx]
)
plt_time <- ggplot(res, aes(x = n, y = min, group = algorithm, color = algorithm)) +
# scale_y_log10() +
geom_point() +
# geom_line(alpha = .4, linetype = "dashed") +
scale_x_log10() +
# geom_abline(data = lm_res, mapping = aes(slope = time_slope, intercept = time_intercept, color = algorithm), linetype = "dashed") +
geom_abline(data = ref_lines, mapping = aes(slope = slope, intercept = time_intercept, color = order), linetype = "dotted") +
# geom_line(data = df_intersect, mapping = aes(x = n_time, y = y_time), alpha = .5, color = "grey", inherit.aes = FALSE) +
labs(y = "Minimum runtime") +
ggtitle("Time")
plt_memory <- ggplot(res, aes(x = n, y = mem_alloc, group = algorithm, color = algorithm)) +
# scale_y_log10() +
geom_point() +
# geom_line(alpha = .4, linetype = "dashed") +
scale_x_log10() +
# geom_abline(data = lm_res, mapping = aes(slope = mem_slope, intercept = mem_intercept, color = algorithm), linetype = "dashed") +
geom_abline(data = ref_lines, mapping = aes(slope = slope, intercept = mem_intercept, color = order), linetype = "dotted") +
# not meaningul, intersection is before n = 10
# geom_line(data = df_intersect, mapping = aes(x = n_mem, y = y_mem), alpha = .5, color = "grey", inherit.aes = FALSE) +
labs(y = "Allocated Memory") +
ggtitle("Memory")
# plotly::subplot(plotly::ggplotly(plt_time), plotly::ggplotly(plt_memory), nrows = 1)
plot(plt_time + plt_memory + plot_layout(guides = "collect"))7. Conclusions
- On my machine, the Fenwick C++ approach outperforms the naive R approach around \(n=\) 2 and the naive C++ approach around \(n=\) 21. Theoretically, the naive approaches are \(\mathcal{O}(n^2)\) in time, while the Fenwick approach is \(\mathcal{O}(n \log n)\).
- The naive R approach always uses more memory than the Fenwick and naive C++ approaches. Theoretically, the naive R approach is \(\mathcal{O}(n^2)\) in memory, while the Fenwick and naive C++ approaches are \(\mathcal{O}(n)\).
- All C++ implementations use
cpp11(no Rcpp dependency) for faster compilation.