Yes, all the M-series have more cores, they often have better thermal management, and they have more memory bandwidth. (The Neo still has crazy high bandwidth.) But, for a single-threaded, strictly compute task that runs in 10 seconds, it outperforms the M4 cores. I don't know why, I'm just sharing my experience.
The actual code I am using for this is:
#include <stdio.h>
#include <sys/time.h>
int gettimeofday(struct timeval *tp, void *tzp);
/* Current wall-clock time in seconds, derived from gettimeofday(). */
static double wall_seconds(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);
    return (tv.tv_sec * 1000000.0 + tv.tv_usec) / 1000000.0;
}

/*
 * Single-threaded compute benchmark.
 *
 * Repeatedly runs the Mandelbrot escape-time iteration over a 200x200 grid
 * covering [-2, 1] x [-1.5, 1.5] until at least RUN_SECONDS of wall-clock
 * time have elapsed, then prints how many full passes over the grid were
 * completed and the time actually taken.
 *
 * The clock is only sampled after each complete pass, so the reported
 * elapsed time is normally slightly above RUN_SECONDS.
 *
 * argc/argv are unused. Returns 0.
 */
int main(int argc, char *argv[])
{
    const double RUN_SECONDS = 10.0;    /* wall-clock budget */
    const double ESCAPE_SQ = 4;         /* squared escape radius (|z| < 2) */

    const double xmin = -2.0, xmax = 1.0;
    const double ymin = -1.5, ymax = 1.5;
    const int max_iter = 1000;
    const int width = 200;
    const int height = 200;

    /* Step between neighbouring sample points along each axis. */
    const double xs = (xmax - xmin) / (double) width;
    const double ys = (ymax - ymin) / (double) height;

    /*
     * volatile forces every read and write of the iteration state to be
     * performed. Nothing computed in the inner loop is used afterwards, so
     * without it the compiler would be free to discard the whole workload.
     */
    volatile double wx, wy, t;

    double x, y;
    double start, now;
    int i, px, py;
    int count = 0;

    (void) argc;
    (void) argv;

    start = wall_seconds();
    now = start;

    while (now - start < RUN_SECONDS) {
        /* One full pass over the grid; coordinates advance by accumulation. */
        for (y = ymin, py = 0; py < height; py++, y += ys) {
            for (x = xmin, px = 0; px < width; px++, x += xs) {
                wx = 0.0;
                wy = 0.0;
                /* z <- z^2 + c, with z = wx + i*wy and c = x + i*y. */
                for (i = 0; i < max_iter && (wx * wx + wy * wy) < ESCAPE_SQ; i++) {
                    t = wx * wx - wy * wy + x;
                    wy = 2 * wx * wy + y;
                    wx = t;
                }
            }
        }
        now = wall_seconds();
        count++;
    }

    printf("%d iterations in %.2f seconds.\n", count, now - start);
    return 0;
}