They are calculating F(99999999) mod 9837. This can be done by 54 matrix multiplications of 2x2 matrices. It takes nanoseconds on a CPU. There's absolutely no point in using the GPU for this.
Yeah, I measure ~250 nanoseconds on CPU single shot. ~125 nanos amortized if repeated 100 times. It's fast enough that I'm not sure I'm benchmarking the method, as opposed to overheads like reading the time.
#include <stdio.h>
#include <time.h>
int fib_mod_9837(int n) {
int a = 0;
int b = 1;
int k = 1;
while (k < n) {
k <<= 1;
}
while (k) {
int a2 = a*a;
int b2 = b*b;
int ab2 = a*b << 1;
a = a2 + ab2;
b = a2 + b2;
if (n & k) {
int t = a;
a += b;
b = t;
}
a %= 9837;
b %= 9837;
k >>= 1;
}
return a;
}
int main() {
struct timespec begin, end;
clock_gettime(CLOCK_MONOTONIC_RAW, &begin);
int result = fib_mod_9837(99999999);
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
printf("%lld\n", result);
printf("elapsed nanos: %lld\n", (end.tv_nsec - begin.tv_nsec) + (end.tv_sec - begin.tv_sec)*1000000000);
}
Yeah, I measure ~250 nanoseconds on CPU single shot. ~125 nanos amortized if repeated 100 times. It's fast enough that I'm not sure I'm benchmarking the method, as opposed to overheads like reading the time.