Fast Concordance Vector in R

1. The problem

We want, for each observation \(i\), the sum

\[ c_i = \sum_{j \ne i} \operatorname{sign}(x_j - x_i)\,\operatorname{sign}(y_j - y_i), \]

where sign is the usual sign function. This is essentially a per-observation contribution to Kendall’s tau.

Naively, this is \(\mathcal{O}(n^2)\), which is infeasible for large \(n\).

2. Naive R solution

The naive R implementation uses outer to compute all pairwise differences:

concordance_naive <- function(x, y) {
  dx <- sign(outer(x, x, `-`))
  dy <- sign(outer(y, y, `-`))
  .rowSums(dx * dy, nrow(dx), ncol(dx))
}

This is simple, and for small \(n\) it is actually very fast because everything is vectorized. However, both dx and dy are \(n \times n\) matrices, so this is \(\mathcal{O}(n^2)\) in memory and time.

2b. Naive Rcpp solution

A straightforward Rcpp solution is:

Rcpp::sourceCpp(code = r"{

#include <Rcpp.h>
using namespace Rcpp;

int sign(double x){
    return x == 0 ? 0 : x > 0 ? 1 : -1;
}

// Defined in Hollander, Wolfe & Chicken, Nonparametric Statistical Methods, 3ed., eq. (8.5), p. 394
int Q(double a, double b, double c, double d) {
    return sign((d-b)*(c-a));
}

// Defined in Hollander, Wolfe & Chicken, Nonparametric Statistical Methods, 3ed., eq. (8.37), p. 415
// [[Rcpp::export]]
IntegerVector concordanceVector_cpp(const NumericVector& x, const NumericVector& y){
    int n = x.size();
    IntegerVector out(n);

    for(int i = 0; i < n; i++){
        out[i] = 0;
        for(int t = 0; t < n; t++){
            if(i != t) out[i] += Q(x[i], y[i], x[t], y[t]);
        }
    }

    return out;
}
}"
)

3. Optimized solution (Fenwick Tree)

The key optimization is to avoid the \(n \times n\) matrix entirely. Steps:

Sort points by x. Then \(\operatorname{sign}(x_j - x_i)\) is determined by index order, i.e., \(\operatorname{sign}(x_j - x_i) = \operatorname{sign}(j - i)\).
Replace y with integer ranks.
Use a Fenwick Tree (Binary Indexed Tree) to efficiently count how many larger/smaller y values exist to the left and right of each index.
Combine these counts to get the concordance vector in \(O(n \log n)\).

This is the same trick used in fast Kendall’s tau algorithms, but adapted to return per-observation contributions.

4. Implementation of the optimized solution

The Fenwick tree algorithm is implemented in C++ using cpp11 in src/concordance.cpp and registered as concordance_fenwick_cpp. For reference, here is the equivalent pure-R implementation:

concordance_fenwick_cpp <- jaspRegression:::concordance_fenwick_cpp

# Pure-R reference implementation (kept for documentation)
concordance_fenwick_inplace <- function(x, y) {
  n <- length(x)
  stopifnot(length(y) == n)
  if (n == 0L) return(integer(0L))
  if (n == 1L) return(0L)

  # 1) order by x (stable), keep x-ordered copies
  ord <- order(x, y, method = "radix")
  xo <- x[ord]
  yo <- y[ord]

  # 2) compute group runs for equal x (use rle to avoid building big lists)
  if (anyDuplicated(xo)) {
    rle_x <- rle(xo)
    group_lengths <- rle_x$lengths
    group_starts <- cumsum(c(1L, head(group_lengths, -1L)))
    group_ends   <- cumsum(group_lengths)
    G <- length(group_lengths)  # number of groups
  } else {
    # fast path
    rle_x <- rle(xo)
    group_lengths <- rep.int(1L, n)
    group_starts <- 1:n
    group_ends   <- 1:n
    G <- n  # number of groups
  }

  # 3) compress y to ranks 1..m (strict ordering of unique y's)
  uniq_y <- sort(unique(yo))
  ry <- match(yo, uniq_y)      # integer vector 1..m
  m <- length(uniq_y)

  # allocate bit and result vectors (integers)
  bit <- integer(m)            # 1-indexed BIT (positions 1..m)
  less_right    <- integer(n)
  greater_right <- integer(n)
  # originally there were separate vectors, but we can reuse some memory
  # less_left     <- integer(n)
  # greater_left  <- integer(n)

  # precompute the results of bitwAnd so we can just do lookup
  # lowbits <- (1:m) - ((1:m) & ((1:m) - 1L))
  # lowbits <- (1:m) - bitwAnd((1:m), ((1:m) - 1L))
  #lowbits <- vapply(1:m, \(i) bitwAnd(i, -i), 0L)# -> lowbits
  lowbits <- bitwAnd(1:m, -1:-m)
  
  # Helper: inline bit_sum and bit_add are implemented as loops below
  # -------- Right sweep (groups processed from right to left) ----------
  total_seen <- 0L
  for (g in G:1L) {
    i1 <- group_starts[g]
    i2 <- group_ends[g]
    # query each index in this group (do NOT add them yet)
    for (i in i1:i2) {
      r <- ry[i]
      # less_right: bit_sum(r-1)
      j <- r - 1L
      s_less <- 0L
      while (j > 0L) {
        s_less <- s_less + bit[j]
        j <- j - lowbits[j] #bitwAnd(j, -j)
      }
      # leq_count: bit_sum(r)
      j <- r
      s_leq <- 0L
      while (j > 0L) {
        s_leq <- s_leq + bit[j]
        j <- j - lowbits[j] #bitwAnd(j, -j)
      }
      less_right[i] <- s_less
      greater_right[i] <- total_seen - s_leq
    }
    # now add this entire group into the BIT (so earlier groups see them)
    for (i in i1:i2) {
      r <- ry[i]
      j <- r
      while (j <= m) {
        bit[j] <- bit[j] + 1L
        j <- j + lowbits[j] #bitwAnd(j, -j)
      }
      total_seen <- total_seen + 1L
    }
  }
  out_xorder <- (greater_right - less_right)  # partial result
  # reset for left sweep, these are now less_left and greater_left
  less_right[] <- 0L
  greater_right[] <- 0L
  

  # -------- Left sweep (groups processed from left to right) ----------
  bit[] <- 0L   # reset BIT in-place (no new allocation)
  total_seen <- 0L
  for (g in 1L:G) {
    i1 <- group_starts[g]
    i2 <- group_ends[g]
    # query each index in this group (do NOT add them yet)
    for (i in i1:i2) {
      r <- ry[i]
      # less_left: bit_sum(r-1)
      j <- r - 1L
      s_less <- 0L
      while (j > 0L) {
        s_less <- s_less + bit[j]
        j <- j - lowbits[j] #bitwAnd(j, -j)
      }
      # leq_count: bit_sum(r)
      j <- r
      s_leq <- 0L
      while (j > 0L) {
        s_leq <- s_leq + bit[j]
        j <- j - lowbits[j] #bitwAnd(j, -j)
      }
      # less_left[i] <- s_less
      # greater_left[i] <- total_seen - s_leq
      less_right[i] <- s_less
      greater_right[i] <- total_seen - s_leq
    }
    # now add this group's ranks to BIT
    for (i in i1:i2) {
      r <- ry[i]
      j <- r
      while (j <= m) {
        bit[j] <- bit[j] + 1L
        j <- j + lowbits[j] #bitwAnd(j, -j)
      }
      total_seen <- total_seen + 1L
    }
  }

  out_xorder <- out_xorder + (less_right - greater_right)
  # reuse some memory
  less_right[] <- 0L
  less_right[ord] <- out_xorder
  return(less_right)
  
  # combine contributions (in x-sorted order), then restore original order
  # out_xorder <- (greater_right - less_right) + (less_left - greater_left)
  # out <- integer(n)
  # out[ord] <- out_xorder
  # out
}

5. Benchmarking

We compare the naive and optimized versions using the bench package.

concordance_naive_cpp <- function(x, y) jaspRegression:::concordance_naive_cpp(as.double(x), as.double(y))
concordance_fenwick_cpp <-  function(x, y) jaspRegression:::concordance_fenwick_cpp(as.double(x), as.double(y))

# results with ties
x <- sample(-5:5, 100, TRUE)
y <- sample(-5:5, 100, TRUE)
naive     = concordance_naive(x, y)
naive_cpp = concordance_naive_cpp(x, y)
fenwick   = concordance_fenwick_cpp(x, y)
all(naive == fenwick) && all(naive_cpp == fenwick)

[1] TRUE

# results without ties
x <- rnorm(10)
y <- rnorm(10)
naive     = concordance_naive(x, y)
naive_cpp = concordance_naive_cpp(x, y)
fenwick   = concordance_fenwick_cpp(x, y)
all(naive == fenwick) && all(naive_cpp == fenwick)

[1] TRUE

orders <- 3
ns <- c(sapply(seq_len(orders), \(o) {
  from <- 10^o
  to   <- 10^(o+1) - from
  seq(from, to, by = from) # e.g., 10, 100 - 10, 10
}), 10^(orders+1))
res <- suppressWarnings(bench::press(
  n = ns,
  {
    x <- rnorm(n)
    y <- rnorm(n)
    bench::mark(
      naive     = concordance_naive(x, y),
      naive_cpp = concordance_naive_cpp(x, y),
      fenwick   = concordance_fenwick_cpp(x, y),
      check = TRUE, # ensure results are checked for equality
      iterations = 3
    )
  },
  .quiet = !interactive()
)) # suppressWarnings to hide the message "Some expressions had a GC in every iteration; so filtering is disabled."

res

# A tibble: 84 × 7
   expression     n      min   median `itr/sec` mem_alloc `gc/sec`
   <bch:expr> <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
 1 naive         10  14.85µs  16.59µs    52159.     5.8KB        0
 2 naive_cpp     10   1.88µs   2.48µs   332664.        0B        0
 3 fenwick       10   2.14µs   3.05µs   248710.        0B        0
 4 naive         20   35.9µs  40.02µs    25825.    22.4KB        0
 5 naive_cpp     20   2.76µs   3.48µs   244976.        0B        0
 6 fenwick       20   2.64µs   3.86µs   213717.        0B        0
 7 naive         30   36.5µs     39µs    25039.    49.8KB        0
 8 naive_cpp     30   3.94µs   5.01µs   191579.        0B        0
 9 fenwick       30    3.1µs   4.63µs   193920.        0B        0
10 naive         40   55.7µs  56.01µs    17181.    88.6KB        0
# ℹ 74 more rows

6. Plot of runtime

res$algorithm <- as.character(res$expression)
idx_bad <- res$mem_alloc == 0
res$mem_alloc[idx_bad] <- min(res$mem_alloc[!idx_bad]) / 100  # to avoid divide by 0
lm_res <- res |>
  group_by(algorithm) |>
  summarize(
    # Fit the model once and extract coefficients
    time_fit = list(lm(log10(min) ~ log10(n))),
    mem_fit = list(lm(log10(mem_alloc) ~ log10(n))),
    .groups = "drop"
  ) |>
  mutate(
    time_intercept = unname(sapply(time_fit, \(x) coef(x)["(Intercept)"])),
    time_slope     = unname(sapply(time_fit, \(x) coef(x)["log10(n)"])),
    mem_intercept  = unname(sapply(mem_fit,  \(x) coef(x)["(Intercept)"])),
    mem_slope      = unname(sapply(mem_fit,  \(x) coef(x)["log10(n)"])),
    # Remove the temporary columns if you don't need them
    .keep = "unused"
  )

idx <- match(c("fenwick", "naive", "naive_cpp"), lm_res$algorithm)
i1 <- which(lm_res$algorithm == "fenwick")
i2 <- seq_along(lm_res$algorithm)[-i1]
intersection_n_time <- 10^(with(lm_res, (time_intercept[i1] - time_intercept[i2]) / (time_slope[i2] - time_slope[i1])))
intersection_n_mem  <- 10^(with(lm_res, (mem_intercept[i1] - mem_intercept[i2])   / (mem_slope[i2] - mem_slope[i1])))
intersection_y_time <- 10^(with(lm_res, (lm_res$time_intercept[i1] + lm_res$time_slope[i1] * log10(intersection_n_time))))
intersection_y_mem  <- 10^(with(lm_res, (lm_res$mem_intercept[i1] + lm_res$mem_slope[i1] * log10(intersection_n_mem))))

intersection_n_time_rounded <- round(intersection_n_time)

# df_intersect <- data.frame(
#   n_time = c(intersection_n_time, intersection_n_time),
#   n_mem  = c(intersection_n_mem, intersection_n_mem),
#   y_time = c(min(res$min),       as_bench_time(intersection_y_time)),
#   y_mem  = c(min(res$mem_alloc), as_bench_time(intersection_y_mem))
# )

idx <- match(c("fenwick", "naive"), lm_res$algorithm)
ref_lines <- data.frame(
  order = c("O(n)", "O(n^2)"),
  slope = c(1, 2),       # Reference slopes for O(n) and O(n^2)
  time_intercept = lm_res$time_intercept[idx],
  mem_intercept  = lm_res$mem_intercept[idx]
)

plt_time <- ggplot(res, aes(x = n, y = min, group = algorithm, color = algorithm)) +
  # scale_y_log10() +
  geom_point() +
  # geom_line(alpha = .4, linetype = "dashed") +
  scale_x_log10() +
  # geom_abline(data = lm_res,     mapping = aes(slope = time_slope, intercept = time_intercept, color = algorithm),  linetype = "dashed") +
  geom_abline(data = ref_lines,  mapping = aes(slope =      slope, intercept = time_intercept, color = order), linetype = "dotted") +
  # geom_line(data = df_intersect, mapping = aes(x = n_time, y = y_time), alpha = .5, color = "grey", inherit.aes = FALSE) +
  labs(y = "Minimum runtime") +
  ggtitle("Time")
plt_memory <- ggplot(res, aes(x = n, y = mem_alloc, group = algorithm, color = algorithm)) +
  # scale_y_log10() +
  geom_point() +
  # geom_line(alpha = .4, linetype = "dashed") +
  scale_x_log10() +
  # geom_abline(data = lm_res,    mapping = aes(slope = mem_slope, intercept = mem_intercept, color = algorithm), linetype = "dashed") +
  geom_abline(data = ref_lines, mapping = aes(slope =     slope, intercept = mem_intercept, color = order), linetype = "dotted") +
  # not meaningul, intersection is before n = 10
  # geom_line(data = df_intersect, mapping = aes(x = n_mem, y = y_mem), alpha = .5, color = "grey", inherit.aes = FALSE) +
  labs(y = "Allocated Memory") +
  ggtitle("Memory")


# plotly::subplot(plotly::ggplotly(plt_time), plotly::ggplotly(plt_memory), nrows = 1)
plot(plt_time + plt_memory + plot_layout(guides = "collect"))

7. Conclusions

On my machine, the Fenwick C++ approach outperforms the naive R approach around \(n=\) 2 and the naive C++ approach around \(n=\) 21. Theoretically, the naive approaches are \(\mathcal{O}(n^2)\) in time, while the Fenwick approach is \(\mathcal{O}(n \log n)\).
The naive R approach always uses more memory than the Fenwick and naive C++ approaches. Theoretically, the naive R approach is \(\mathcal{O}(n^2)\) in memory, while the Fenwick and naive C++ approaches are \(\mathcal{O}(n)\).
All C++ implementations use cpp11 (no Rcpp dependency) for faster compilation.