Fast Concordance Vector in R

1. The problem

We want, for each observation \(i\), the sum

\[ c_i = \sum_{j \ne i} \operatorname{sign}(x_j - x_i)\,\operatorname{sign}(y_j - y_i), \]

where sign is the usual sign function. This is essentially a per-observation contribution to Kendall’s tau.
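
As a quick illustration of the definition, here it is applied directly to a small toy example (the values are chosen only for exposition):

x <- c(1, 2, 3)
y <- c(1, 3, 2)
sapply(seq_along(x), \(i) sum(sign(x[-i] - x[i]) * sign(y[-i] - y[i])))
[1] 2 0 0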

Naively, this is \(\mathcal{O}(n^2)\), which is infeasible for large \(n\).


2. Naive R solution

The naive R implementation uses outer to compute all pairwise differences:

concordance_naive <- function(x, y) {
  dx <- sign(outer(x, x, `-`))
  dy <- sign(outer(y, y, `-`))
  .rowSums(dx * dy, nrow(dx), ncol(dx))
}

This is simple, and for small \(n\) it is actually very fast because everything is vectorized. However, both dx and dy are \(n \times n\) matrices, so this is \(\mathcal{O}(n^2)\) in both time and memory; at \(n = 50{,}000\), for instance, each \(n \times n\) double matrix alone would take about \(8n^2 \approx 20\) GB.

2b. Naive Rcpp solution

A straightforward Rcpp port of the same \(\mathcal{O}(n^2)\) double loop is:

Rcpp::sourceCpp(code = r"{

#include <Rcpp.h>
using namespace Rcpp;

int sign(double x){
    return x == 0 ? 0 : x > 0 ? 1 : -1;
}

// Defined in Hollander, Wolfe & Chicken, Nonparametric Statistical Methods, 3ed., eq. (8.5), p. 394
int Q(double a, double b, double c, double d) {
    return sign((d-b)*(c-a));
}

// Defined in Hollander, Wolfe & Chicken, Nonparametric Statistical Methods, 3ed., eq. (8.37), p. 415
// [[Rcpp::export]]
IntegerVector concordanceVector_cpp(const NumericVector& x, const NumericVector& y){
    int n = x.size();
    IntegerVector out(n);

    for(int i = 0; i < n; i++){
        out[i] = 0;
        for(int t = 0; t < n; t++){
            if(i != t) out[i] += Q(x[i], y[i], x[t], y[t]);
        }
    }

    return out;
}
}"
)

3. Optimized solution (Fenwick Tree)

The key optimization is to avoid the \(n \times n\) matrix entirely. Steps:

  1. Sort points by x. For distinct x values, \(\operatorname{sign}(x_j - x_i)\) is then determined by index order, i.e., \(\operatorname{sign}(x_j - x_i) = \operatorname{sign}(j - i)\); runs of tied x values are handled as groups, since pairs tied in x contribute 0.
  2. Replace y with integer ranks.
  3. Use a Fenwick Tree (Binary Indexed Tree) to efficiently count how many larger/smaller y values exist to the left and right of each index.
  4. Combine these counts to get the concordance vector in \(O(n \log n)\).

This is the same trick used in fast Kendall’s tau algorithms, but adapted to return per-observation contributions.
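
To make the bookkeeping concrete, here is a minimal stand-alone sketch of the two Fenwick-tree operations (prefix-sum query and point update) that the implementation in the next section inlines as loops; the helper names bit_sum and bit_add are only illustrative:

# Fenwick tree (BIT) over ranks 1..m, stored as a plain integer vector `bit`
bit_sum <- function(bit, i) {   # how many inserted values have rank <= i
  s <- 0L
  while (i > 0L) {
    s <- s + bit[i]
    i <- i - bitwAnd(i, -i)     # drop the lowest set bit
  }
  s
}
bit_add <- function(bit, i) {   # insert one value with rank i
  m <- length(bit)
  while (i <= m) {
    bit[i] <- bit[i] + 1L
    i <- i + bitwAnd(i, -i)
  }
  bit                           # R copies on modify, so the updated tree is returned
}

Each operation touches \(\mathcal{O}(\log m)\) entries of bit, which is what brings the total cost down to \(\mathcal{O}(n \log n)\).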


4. Implementation of the optimized solution

# in-place Fenwick (handles ties in x and y; minimal allocations)
# the code is a bit verbose because R functions cannot modify their arguments in place,
# so the Fenwick queries and updates are inlined as while-loops
concordance_fenwick_inplace <- function(x, y) {
  n <- length(x)
  stopifnot(length(y) == n)
  if (n == 0L) return(integer(0L))
  if (n == 1L) return(0L)

  # 1) order by x (stable), keep x-ordered copies
  ord <- order(x, y, method = "radix")
  xo <- x[ord]
  yo <- y[ord]

  # 2) compute group runs for equal x (use rle to avoid building big lists)
  if (anyDuplicated(xo)) {
    rle_x <- rle(xo)
    group_lengths <- rle_x$lengths
    group_starts <- cumsum(c(1L, head(group_lengths, -1L)))
    group_ends   <- cumsum(group_lengths)
    G <- length(group_lengths)  # number of groups
  } else {
    # fast path: no ties in x, so every observation is its own group
    group_lengths <- rep.int(1L, n)
    group_starts <- 1:n
    group_ends   <- 1:n
    G <- n  # number of groups
  }

  # 3) compress y to ranks 1..m (strict ordering of unique y's)
  uniq_y <- sort(unique(yo))
  ry <- match(yo, uniq_y)      # integer vector 1..m
  m <- length(uniq_y)

  # allocate bit and result vectors (integers)
  bit <- integer(m)            # 1-indexed BIT (positions 1..m)
  less_right    <- integer(n)
  greater_right <- integer(n)
  # originally there were separate vectors, but we can reuse some memory
  # less_left     <- integer(n)
  # greater_left  <- integer(n)

  # precompute lowbit(i) = bitwAnd(i, -i) for i = 1..m so the sweeps below can use a lookup
  lowbits <- bitwAnd(1:m, -(1:m))

  # the Fenwick operations (prefix-sum query and point update) are inlined as
  # while-loops below to avoid function-call overhead
  # -------- Right sweep (groups processed from right to left) ----------
  total_seen <- 0L
  for (g in G:1L) {
    i1 <- group_starts[g]
    i2 <- group_ends[g]
    # query each index in this group (do NOT add them yet)
    for (i in i1:i2) {
      r <- ry[i]
      # less_right: bit_sum(r-1)
      j <- r - 1L
      s_less <- 0L
      while (j > 0L) {
        s_less <- s_less + bit[j]
        j <- j - lowbits[j] #bitwAnd(j, -j)
      }
      # leq_count: bit_sum(r)
      j <- r
      s_leq <- 0L
      while (j > 0L) {
        s_leq <- s_leq + bit[j]
        j <- j - lowbits[j] #bitwAnd(j, -j)
      }
      less_right[i] <- s_less
      greater_right[i] <- total_seen - s_leq
    }
    # now add this entire group into the BIT (so earlier groups see them)
    for (i in i1:i2) {
      r <- ry[i]
      j <- r
      while (j <= m) {
        bit[j] <- bit[j] + 1L
        j <- j + lowbits[j] #bitwAnd(j, -j)
      }
      total_seen <- total_seen + 1L
    }
  }
  out_xorder <- (greater_right - less_right)  # partial result
  # reset for left sweep, these are now less_left and greater_left
  less_right[] <- 0L
  greater_right[] <- 0L
  

  # -------- Left sweep (groups processed from left to right) ----------
  bit[] <- 0L   # reset BIT in-place (no new allocation)
  total_seen <- 0L
  for (g in 1L:G) {
    i1 <- group_starts[g]
    i2 <- group_ends[g]
    # query each index in this group (do NOT add them yet)
    for (i in i1:i2) {
      r <- ry[i]
      # less_left: bit_sum(r-1)
      j <- r - 1L
      s_less <- 0L
      while (j > 0L) {
        s_less <- s_less + bit[j]
        j <- j - lowbits[j] #bitwAnd(j, -j)
      }
      # leq_count: bit_sum(r)
      j <- r
      s_leq <- 0L
      while (j > 0L) {
        s_leq <- s_leq + bit[j]
        j <- j - lowbits[j] #bitwAnd(j, -j)
      }
      # less_left[i] <- s_less
      # greater_left[i] <- total_seen - s_leq
      less_right[i] <- s_less
      greater_right[i] <- total_seen - s_leq
    }
    # now add this group's ranks to BIT
    for (i in i1:i2) {
      r <- ry[i]
      j <- r
      while (j <= m) {
        bit[j] <- bit[j] + 1L
        j <- j + lowbits[j] #bitwAnd(j, -j)
      }
      total_seen <- total_seen + 1L
    }
  }

  # combine contributions (in x-sorted order), then restore the original order,
  # reusing less_right to hold the final result
  out_xorder <- out_xorder + (less_right - greater_right)
  less_right[ord] <- out_xorder
  less_right
}

5. Benchmarking

We compare the naive and optimized versions using the bench package.

# results with ties
x <- sample(-5:5, 100, TRUE)
y <- sample(-5:5, 100, TRUE)
naive     = concordance_naive(x, y)
naive_cpp = concordanceVector_cpp(x, y)
fenwick2  = concordance_fenwick_inplace(x, y)
all(naive == fenwick2) && all(naive_cpp == fenwick2)
[1] TRUE
# results without ties
x <- rnorm(10)
y <- rnorm(10)
naive     = concordance_naive(x, y)
naive_cpp = concordanceVector_cpp(x, y)
fenwick2  = concordance_fenwick_inplace(x, y)
all(naive == fenwick2) && all(naive_cpp == fenwick2)
[1] TRUE
# profvis::profvis({
#   for (i in 1:10000)
#     concordance_fenwick_inplace(x, y)
# })
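
Since the \(c_i\) are per-observation contributions to Kendall's tau, a further sanity check is that their sum, normalized by \(n(n-1)\), should equal the tau returned by cor(x, y, method = "kendall") when there are no ties, as with the rnorm vectors above:

n <- length(x)
all.equal(sum(concordance_fenwick_inplace(x, y)) / (n * (n - 1)),
          cor(x, y, method = "kendall"))  # TRUE when x and y are tie-free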

orders <- 3
# n = 10, 20, ..., 90, 100, 200, ..., 900, ..., 10000
ns <- c(sapply(seq_len(orders), \(o) {
  from <- 10^o
  to   <- 10^(o + 1) - from
  seq(from, to, by = from) # e.g., for o = 1: seq(10, 90, by = 10)
}), 10^(orders + 1))
res <- suppressWarnings(bench::press(
  n = ns,
  {
    x <- rnorm(n)
    y <- rnorm(n)
    bench::mark(
      naive     = concordance_naive(x, y),
      naive_cpp = concordanceVector_cpp(x, y),
      fenwick   = concordance_fenwick_inplace(x, y),
      check = TRUE, # ensure results are checked for equality
      iterations = 3
    )
  },
  .quiet = !interactive()
)) # suppressWarnings to hide the message "Some expressions had a GC in every iteration; so filtering is disabled."

res
# A tibble: 84 × 7
   expression     n      min   median `itr/sec` mem_alloc `gc/sec`
   <bch:expr> <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
 1 naive         10  15.75µs  17.71µs    50173.     5.8KB        0
 2 naive_cpp     10   1.89µs   2.47µs   328722.        0B        0
 3 fenwick       10  125.5µs 145.73µs     6575.        0B        0
 4 naive         20  21.59µs  24.81µs    39850.   22.41KB        0
 5 naive_cpp     20   2.77µs   4.43µs   218560.        0B        0
 6 fenwick       20  138.2µs 158.46µs     6469.     2.7KB        0
 7 naive         30   38.7µs  40.75µs    23444.   49.83KB        0
 8 naive_cpp     30   5.39µs   5.69µs   145643.        0B        0
 9 fenwick       30 165.59µs 178.94µs     5586.    3.41KB        0
10 naive         40  75.67µs   76.8µs    12557.   88.59KB        0
# ℹ 74 more rows

6. Plot of runtime

library(dplyr)      # group_by() / summarize() / mutate()
library(ggplot2)    # plots
library(patchwork)  # plot_layout() for combining the panels

res$algorithm <- as.character(res$expression)
idx_bad <- res$mem_alloc == 0
res$mem_alloc[idx_bad] <- min(res$mem_alloc[!idx_bad]) / 100  # avoid log10(0) = -Inf in the fits below
lm_res <- res |>
  group_by(algorithm) |>
  summarize(
    # Fit the model once and extract coefficients
    time_fit = list(lm(log10(min) ~ log10(n))),
    mem_fit = list(lm(log10(mem_alloc) ~ log10(n))),
    .groups = "drop"
  ) |>
  mutate(
    time_intercept = unname(sapply(time_fit, \(x) coef(x)["(Intercept)"])),
    time_slope     = unname(sapply(time_fit, \(x) coef(x)["log10(n)"])),
    mem_intercept  = unname(sapply(mem_fit,  \(x) coef(x)["(Intercept)"])),
    mem_slope      = unname(sapply(mem_fit,  \(x) coef(x)["log10(n)"])),
    # Remove the temporary columns if you don't need them
    .keep = "unused"
  )
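
The crossover points quoted in the conclusions come from intersecting these fitted log-log lines: setting \(a_1 + b_1 \log_{10} n = a_2 + b_2 \log_{10} n\) and solving for \(n\) gives

\[ n^{*} = 10^{(a_1 - a_2)/(b_2 - b_1)}, \]

which is what the next block evaluates for the Fenwick fit against each of the other two algorithms (and analogously for memory).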

i1 <- which(lm_res$algorithm == "fenwick")
i2 <- seq_along(lm_res$algorithm)[-i1]
intersection_n_time <- 10^(with(lm_res, (time_intercept[i1] - time_intercept[i2]) / (time_slope[i2] - time_slope[i1])))
intersection_n_mem  <- 10^(with(lm_res, (mem_intercept[i1] - mem_intercept[i2])   / (mem_slope[i2] - mem_slope[i1])))
intersection_y_time <- 10^(with(lm_res, time_intercept[i1] + time_slope[i1] * log10(intersection_n_time)))
intersection_y_mem  <- 10^(with(lm_res, mem_intercept[i1]  + mem_slope[i1]  * log10(intersection_n_mem)))

intersection_n_time_rounded <- round(intersection_n_time)

# df_intersect <- data.frame(
#   n_time = c(intersection_n_time, intersection_n_time),
#   n_mem  = c(intersection_n_mem, intersection_n_mem),
#   y_time = c(min(res$min),       as_bench_time(intersection_y_time)),
#   y_mem  = c(min(res$mem_alloc), as_bench_time(intersection_y_mem))
# )

idx <- match(c("fenwick", "naive"), lm_res$algorithm)
ref_lines <- data.frame(
  order = c("O(n)", "O(n^2)"),
  slope = c(1, 2),       # Reference slopes for O(n) and O(n^2)
  time_intercept = lm_res$time_intercept[idx],
  mem_intercept  = lm_res$mem_intercept[idx]
)

plt_time <- ggplot(res, aes(x = n, y = min, group = algorithm, color = algorithm)) +
  # scale_y_log10() +
  geom_point() +
  # geom_line(alpha = .4, linetype = "dashed") +
  scale_x_log10() +
  # geom_abline(data = lm_res,     mapping = aes(slope = time_slope, intercept = time_intercept, color = algorithm),  linetype = "dashed") +
  geom_abline(data = ref_lines,  mapping = aes(slope =      slope, intercept = time_intercept, color = order), linetype = "dotted") +
  # geom_line(data = df_intersect, mapping = aes(x = n_time, y = y_time), alpha = .5, color = "grey", inherit.aes = FALSE) +
  labs(y = "Minimum runtime") +
  ggtitle("Time")
plt_memory <- ggplot(res, aes(x = n, y = mem_alloc, group = algorithm, color = algorithm)) +
  # scale_y_log10() +
  geom_point() +
  # geom_line(alpha = .4, linetype = "dashed") +
  scale_x_log10() +
  # geom_abline(data = lm_res,    mapping = aes(slope = mem_slope, intercept = mem_intercept, color = algorithm), linetype = "dashed") +
  geom_abline(data = ref_lines, mapping = aes(slope =     slope, intercept = mem_intercept, color = order), linetype = "dotted") +
  # not meaningful: the intersection is before n = 10
  # geom_line(data = df_intersect, mapping = aes(x = n_mem, y = y_mem), alpha = .5, color = "grey", inherit.aes = FALSE) +
  labs(y = "Allocated Memory") +
  ggtitle("Memory")


# plotly::subplot(plotly::ggplotly(plt_time), plotly::ggplotly(plt_memory), nrows = 1)
plot(plt_time + plt_memory + plot_layout(guides = "collect"))


7. Conclusions

  • On my machine, the Fenwick approach becomes faster than the naive R approach around \(n=\) 138 and faster than the naive C++ approach around \(n=\) 608. Theoretically, both naive approaches are \(\mathcal{O}(n^2)\) in time (the C++ one just has a far smaller constant), while the Fenwick approach is \(\mathcal{O}(n \log n)\).
  • The naive approach always uses more memory than the Fenwick approach, which in turn uses more than the C++ approach. Theoretically, the naive approach is \(\mathcal{O}(n^2)\) in memory, while the Fenwick and C++ approaches are \(\mathcal{O}(n)\).