#include <stdio.h>
#include <stdlib.h>
#include "types.h"
#include "align.h"

/* Basic convolution-from-polynomial-product defs, use degree-2 polys
	A = a0+a1.x+a2.x^2, B = b0+b1.x+b2.x^2 to illustrate:
Polynomial-product [a0,a1,a2] * [b0,b1,b2] has terms (coefficients)
	x^0	a0.b0
	x^1	a0.b1+a1.b0
	x^2	a0.b2+a1.b1+a2.b0
	x^3	      a1.b2+a2.b1
	x^4	            a2.b2
Modulo (x^3-1) gives a cyclic convo:
	x^0	a0.b0+a1.b2+a2.b1
	x^1	a0.b1+a1.b0+a2.b2
	x^2	a0.b2+a1.b1+a2.b0
Modulo (x^3+1) gives an acyclic (elsewhere more commonly termed negacyclic) convo:
	x^0	a0.b0-a1.b2-a2.b1
	x^1	a0.b1+a1.b0-a2.b2
	x^2	a0.b2+a1.b1+a2.b0
*/
// Bytewise POPCNT lookup table: pop8[b] = number of set bits in the byte value b.
// Laid out one row per high nibble h, so every entry of row h equals
// popcount(h) + popcount(low nibble).
const uint8 pop8[256] = {
	0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,	// 0x00-0x0f
	1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,	// 0x10-0x1f
	1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,	// 0x20-0x2f
	2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,	// 0x30-0x3f
	1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,	// 0x40-0x4f
	2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,	// 0x50-0x5f
	2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,	// 0x60-0x6f
	3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,	// 0x70-0x7f
	1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,	// 0x80-0x8f
	2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,	// 0x90-0x9f
	2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,	// 0xa0-0xaf
	3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,	// 0xb0-0xbf
	2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,	// 0xc0-0xcf
	3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,	// 0xd0-0xdf
	3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,	// 0xe0-0xef
	4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8		// 0xf0-0xff
};

/*
Runtime assertion handler. If expr is zero, prints the source location and the
supplied message to stderr, flushes all output streams and terminates the process
with EXIT_FAILURE. If expr is nonzero the call has no effect.

	line          : source line number to report
	file          : source file name to report
	expr          : condition being asserted; nonzero = OK
	assert_string : message describing the failed assertion
*/
void ASSERT(long line, char*file, int expr, char*assert_string) {
	if(expr)
		return;
	/* Define a convenient spot to set a breakpoint: */
	// 'line' is a signed long, so %ld (not %lu) is the matching conversion; also
	// avoid handing a null pointer to %s, which is undefined behavior:
	fprintf(stderr,"ERROR: at line %ld of file %s\n", line, file ? file : "(unknown)");
	fprintf(stderr,"Assertion failed: %s\n", assert_string ? assert_string : "(no message)");
	/* Flush all output streams prior to exiting. An exit(EXIT_FAILURE) is used rather than
	assert(0), since some compilers seem to like to optimize away assertions. */
	fflush(NULL);
	exit(EXIT_FAILURE);
}

// Check whether two n-vectors are circularly-shifted (cshifted) versions of each other,
// i.e. whether some offset s exists such that _vec1[(i+s) % _n] == _vec0[i] for all i in [0,_n).
//	_vec0, _vec1 : the two byte-vectors to compare, each holding at least _n elements
//	_n           : common vector length
// Returns 1 if such an offset exists, 0 otherwise. Returns 0 for _n <= 0.
// Every position of _vec1 holding the value _vec0[0] is tried as a candidate offset, so
// vectors containing repeated values are handled correctly, not just permutation vectors.
int vec_eq_modulo_cshift(uint8 _vec0[], uint8 _vec1[], int _n) {
	int shift, k, idx;
	// Reject nonpositive lengths up front; all indices are kept signed so a negative
	// _n can never be converted into a huge unsigned loop bound:
	if(_n <= 0)
		return 0;
	for(shift = 0; shift < _n; shift++) {
		if(_vec1[shift] != _vec0[0])
			continue;	// Not a candidate alignment
		// Walk both vectors in lockstep, wrapping the _vec1 index at the end:
		idx = shift;
		for(k = 0; k < _n; k++) {
			if(_vec1[idx] != _vec0[k])
				break;
			if(++idx == _n)
				idx = 0;
		}
		if(k == _n)	// All _n elements matched at this offset
			return 1;
	}
	return 0;	// No offset makes the vectors agree
}

/*
Implements Even's improved version of Steinhaus-Johnson-Trotter n-perm-generating algorithm
(https://en.wikipedia.org/wiki/Steinhaus%E2%80%93Johnson%E2%80%93Trotter_algorithm),
illustrated here using 3-perms. The only modification in the actual implementation is the use
of 0-offset indexing, e.g. our initial state = 0,-1,-2,...,-(n-1) rather than 1,-2,-3,...,-n:

0. Initially, the direction of the number 1 is zero, and all other elements have a negative direction:
	 0 −1 −2	= parr[0,1,2]
1. At each step, the algorithm finds the largest element with a nonzero direction, and swaps it in the indicated direction:
	 0 −2 −1	largest = parr[1] = -2, read "magnitude 2, direction -", swap -2 leftward
	-2  0 -1	note: if curr elt's value = n and has +- dir, do not need to re-search array for a new largest-elt-with-nonzero-direction
	 2 −1  0	largest = parr[1] = -1, swap -1 leftward
	 1 +2  0	largest = parr[0] = +2, swap +2 rightward
	 1  0 +2	largest = parr[1] = +2, swap +2 rightward
2. If this causes the chosen element to reach the first or last position within the permutation, or if the next element in the same direction is larger than the chosen element, the direction of the chosen element is set to zero:
	 0 -2 -1	no change
	 2  0 −1	curr-elt (-2) reached lcol, set dir = 0
	 2  1  0	next-elt in same (-1) dir as curr-elt -1 is 2 which is larger, set dir of -1 to 0
	 1 +2  0	no change
	 1  0  2	curr-elt (+2) reached rcol, set dir = 0
3. After each step, all elements greater than the chosen element have their directions set to positive or negative, according to whether they are between the chosen element and the start or the end of the permutation respectively. Thus, in this example, when the number 1 moves, the number 2 becomes marked with a direction again:
	 0 -2 -1	no change, since -2 is max elt
	 2  0 −1	no change, since 2 is max elt
	+2  1  0	2 > 1 and lies to the left of 1, set dir = +
	 1 +2  0	no change, since +2 is max elt
	 1  0  2	no change, since 2 is max elt
4. if at least one elt has nonzero sign, goto [1], otherwise when all numbers become unmarked, the algorithm terminates.
	 0 -2 -1	goto [1]
	 2  0 −1	goto [1]
	+2  1  0	goto [1]
	 1 +2  0	goto [1]
	 1  0  2	done.

if init_perm = True, algo inits internal perm-array according to Even's scheme = [0,-1,-2,...,-(n-1)].
At end of each call returns an updated perm-array whose elements are just those of the internal perm-array
sans the directional (sign 0,+,-) metadata.
An init-mode call sets parr[] = 0,1,2,...,n-1 and returns 1 in addition to doing the aforementioned internal-state inits.

Function return value = 1 if there was at least 1 perm remaining to do, i.e. if current call updated parr[].
Function return value = 0 if there were no perms remaining to do, in which case there is no update of parr[].
IT IS ASSUMED THAT CALLER WILL ONLY BE READING PARR[], NOT MODIFYING ITS CONTENTS BETWEEN CALLS.
IT IS ASSUMED THAT ANY CHANGE IN PERM-LENGTH ON THE PART OF THE CALLER WILL BE ANNOUNCED VIA AN INIT-MODE CALL.
*/
/*
Advance the Steinhaus-Johnson-Trotter (Even's variant) permutation generator by one step.

	parr      : caller-owned array of n ints which receives the current permutation of 0,...,n-1
	init_perm : nonzero = (re)allocate and reset the internal state and set parr[] = 0,1,...,n-1
	n         : permutation length, must be > 0

Returns 1 if parr[] was written by this call, 0 if no element has a nonzero direction left,
i.e. all perms have been generated; in the latter case parr[] is left untouched.

The generator state lives in a function-static array, so only one permutation sequence can
be in progress at a time. Each state word holds the element value in its low 30 bits and a
direction tag in its top 2 bits (00 = none, 01 = +, 11 = -); this encoding assumes a 32-bit
2s-complement int.
*/
int	nperm(int parr[], int init_perm, int n)
{
	const int smask   = 0x3fffffff;	// Low 30 bits = element value
	const int dir_pos = 1 << 30;	// Top bits 01 = + direction
	// Top bits 11 = - direction. Written as -(2^30) rather than (0x3 << 30), since the
	// latter overflows a signed 32-bit int, which is undefined behavior:
	const int dir_neg = -(1 << 30);
	static int *pstate = NULL;
	int i, dir, val, hold, curr_dir = 0, curr_idx = 0, curr_val;
	unsigned tag;

	ASSERT(HERE, n > 0, "Permutation length must be > 0!");
	ASSERT(HERE, n - 1 <= smask, "Permutation length exceeds 30-bit element encoding!");
	if(init_perm) {
		// NOTE(review): ALLOC_INT is a project macro, presumably realloc-style - confirm.
		pstate = ALLOC_INT(pstate, n);
		ASSERT(HERE, pstate != NULL, "Unable to allocate permutation-state array!");
		// 0. Initially, the direction of element 0 is zero, and all other elements have a negative direction:
		pstate[0] = 0;
		for(i = 1; i < n; i++) {
			pstate[i] = dir_neg + i;
		}
		// User-visible perm-array is just the internal one sans directional metadata:
		for(i = 0; i < n; i++) {
			parr[i] = pstate[i] & smask;
		}
		return 1;
	}
	ASSERT(HERE, pstate != NULL, "nperm: an init-mode call is required before first use!");

	// 1. At each step, the algorithm finds the largest element with a nonzero direction:
	curr_val = -1;
	for(i = 0; i < n; i++) {
		// Extract the 2-bit tag via an unsigned shift, which is fully defined,
		// unlike a right-shift of a negative signed int:
		tag = (unsigned)pstate[i] >> 30;
		if(tag == 0u)
			continue;
		ASSERT(HERE, tag == 1u || tag == 3u, "Invalid direction value!");
		dir = (tag == 1u) ? +1 : -1;
		val = pstate[i] & smask;
		if(val > curr_val) {
			curr_dir = dir;
			curr_val = val;
			curr_idx = i;
		}
	}
	// 4. If no elt has nonzero direction, no more perms to do:
	if(curr_val == -1)
		return 0;

	// 2a. Swap largest-element-with-a-nonzero-direction in the indicated direction:
	hold = pstate[curr_idx];
	pstate[curr_idx] = pstate[curr_idx + curr_dir];
	pstate[curr_idx + curr_dir] = hold;
	curr_idx += curr_dir;
	// 2b,c. If the chosen element has reached the first or last position, or if the next element
	// in the same direction is larger than it, zero its direction. Short-circuit evaluation keeps
	// the neighbor lookup from running when the chosen element sits at either end:
	if(curr_idx == 0 || curr_idx == n-1 || curr_val < (pstate[curr_idx + curr_dir] & smask))
		pstate[curr_idx] &= smask;

	// 3. After each step, all elts > chosen element have their dirs set to + or -, according to
	// whether they lie between the chosen element and the start or the end of the perm, respectively:
	for(i = 0; i < n; i++) {
		if(i == curr_idx)
			continue;
		val = pstate[i] & smask;
		if(val > curr_val)
			pstate[i] = val + ((i < curr_idx) ? dir_pos : dir_neg);
	}
	// User-visible perm-array is just the internal one sans directional metadata:
	for(i = 0; i < n; i++) {
		parr[i] = pstate[i] & smask;
	}
	return 1;
}

// Simple program to take the (n-1)/2 x (n-1)/2 sine-term circulant submatrix resulting
// from a prime-length-n complex DFT matrix decomposition and find the combination of sine-
// term and x-term sign flips which minimizes the number of opposite-signed terms in the
// length - (n-1)/2 convolution resulting from said circulant submatrix:
int main()
{
#ifndef N
	#define N	11
#endif
	const int nc = (N-1)/2, imax = 1<<nc;	// nc = length of circulant-submatrix convolution
	uint64 i,j,flip,mask1,mask2;
	uint8 popc,popt;
	uint32 nums, mins = -1;	// Init mins to max-uint32

#if (N > 17)
	#error Selected N exceeds max supported of 17!
#elif (N == 11)
/*
11-DFT:
	S1 = s1*(x1-xA)+s2*(x2-x9)+s3*(x3-x8)+s4*(x4-x7)+s5*(x5-x6)
	S2 = s2*(x1-xA)+s4*(x2-x9)-s5*(x3-x8)-s3*(x4-x7)-s1*(x5-x6)
	S3 = s3*(x1-xA)-s5*(x2-x9)-s2*(x3-x8)+s1*(x4-x7)+s4*(x5-x6)
	S4 = s4*(x1-xA)-s3*(x2-x9)+s1*(x3-x8)+s5*(x4-x7)-s2*(x5-x6)
	S5 = s5*(x1-xA)-s1*(x2-x9)+s4*(x3-x8)-s2*(x4-x7)+s3*(x5-x6),
*** How to reindex into convolution? here are the per-row s-index patterns:
	S1	12345
	S2	24531
	S3	35214
	S4	43152
	S5	51423
Seek a 5-perm which turns these into 5 circularly-shifted versions of 12345.
	Swapping rows corr. to swapping convo output indices
	Swapping cols corr. to swapping sine term indices
Simply swapping output S1 into middle slot makes the 5 cols vertically-cshifted images of each other:
	S2	24531
	S3	35214
	S1	12345
	S4	43152
	S5	51423
Analogously, if swap col 1 into middle-col slot, we get 5 rows which are horiz-cshifts w.r.to each other:
	S1	23145
	S2	45231
	S3	52314
	S4	31452
	S5	14523 ,
and munge output ordering to make rows 2-5 monotonically-increasing-shiftcount cshifts w.r.to row 1:
	S1	23145
	S4	31452 = S1<<1
	S5	14523 = S1<<2
	S2	45231 = S1<<3
	S3	52314 = S1<<4
In terms of our original matrix-mul-convo this is
	S1 = +s2*(x2-x9)+s3*(x3-x8)+s1*(x1-xA)+s4*(x4-x7)+s5*(x5-x6)
	S3 = -s5*(x2-x9)-s2*(x3-x8)+s3*(x1-xA)+s1*(x4-x7)+s4*(x5-x6)
	S2 = +s4*(x2-x9)-s5*(x3-x8)+s2*(x1-xA)-s3*(x4-x7)-s1*(x5-x6)
	S5 = -s1*(x2-x9)+s4*(x3-x8)+s5*(x1-xA)-s2*(x4-x7)+s3*(x5-x6)
	S4 = -s3*(x2-x9)+s1*(x3-x8)+s4*(x1-xA)+s5*(x4-x7)-s2*(x5-x6)
so relabeling s23145 = a01234 and y0-4 = x[2-9,3-8,1-A,4-7,5-6] we have
	S1 = +a0*y0+a1*y1+a2*y2+a3*y3+a4*y4
	S4 = -a1*y0+a2*y1+a3*y2+a4*y3-a0*y4
	S5 = -a2*y0+a3*y1+a4*y2-a0*y3+a1*y4
	S2 = +a3*y0-a4*y1+a0*y2-a1*y3-a2*y4
	S3 = -a4*y0-a0*y1+a1*y2+a2*y3+a3*y4 , which is in form of a standard 5-convo except for the - signs.
Thus the pattern of signs (0 = positive, 1 = negative) is, with LSB corr. to y0 and MSB to y4:
	00000 = 0x00
	10001 = 0x11
	01001 = 0x09
	11010 = 0x1a
	00011 = 0x03 .
Flipping signs of a0,y0 we have
	S1 = +a0*y0+a1*y1+a2*y2+a3*y3+a4*y4
	S4 = +a1*y0+a2*y1+a3*y2+a4*y3+a0*y4
	S5 = +a2*y0+a3*y1+a4*y2+a0*y3+a1*y4
	S2 = -a3*y0-a4*y1-a0*y2-a1*y3-a2*y4
	S3 = +a4*y0+a0*y1+a1*y2+a2*y3+a3*y4 , which is in form of a standard 5-convo once we flip the sign of the S2 output.
*/
	#define cshift(x,i) (((x<<(5-i))&mask) + (x>>i)) // Rightward circular shift of 5-bit int x by i bits
	const uint64 rows = (0x03<<20)+(0x1a<<15)+(0x09<<10)+(0x11<<5)+0x0;	// Concatenate rowwise bit patterns into uint64 ...
	const uint64 mask = 0x1f;											// this limits us to nc = 8 (N = 17), obviously.
	printf("N = 11:\n");
	for(i = 0; i < imax; i++) {	// Loop over all 2^nc sine-terms sign patterns
		mask1 = (uint64)i;	mask1 += (mask1<<5)+(mask1<<10)+(mask1<<15)+(mask1<<20);	// This mask is same for each term of our convolution, so just duplicate 5 times
		for(j = 0; j < imax; j++) {	// Loop over all 2^nc x-terms sign patterns
			// This mask gets left-cshifted one more per row:
			mask2 = (uint64)j;	mask2 += (cshift(mask2,1)<<5)+(cshift(mask2,2)<<10)+(cshift(mask2,3)<<15)+(cshift(mask2,4)<<20);
			flip = rows ^ mask1 ^ mask2;	// Apply s-term and x-terms flips
/*
if(i == 1 && j <= 5)
	printf("I,J = %u,%u, m1,2,1^2 = %llX,%llX,%llX, rows^(m1^m2) = %llX, (rows^m1)^m2 = %llX: popc-values = %u,%u,%u,%u,%u\n",i,j,mask1,mask2,mask1^mask2,rows^(mask1^mask2),(rows^mask1)^mask2,pop8[flip & mask],pop8[(flip>> 5) & mask],pop8[(flip>>10) & mask],pop8[(flip>>15) & mask],pop8[(flip>>20) & mask]);
*/
			popt = 0;
			popc = pop8[(flip    ) & mask]; popc = MIN(popc, nc-popc); popt += popc;
			popc = pop8[(flip>> 5) & mask]; popc = MIN(popc, nc-popc); popt += popc;
			popc = pop8[(flip>>10) & mask]; popc = MIN(popc, nc-popc); popt += popc;
			popc = pop8[(flip>>15) & mask]; popc = MIN(popc, nc-popc); popt += popc;
			popc = pop8[(flip>>20) & mask]; popc = MIN(popc, nc-popc); popt += popc;
			nums = popt;
			if(nums < mins) {
				mins = nums;
				printf("\tWith xflip = %llX, sflip = %llX: #sign-flips = %d\n",i,j,mins);
			}
		}
	}

#elif (N == 13)
  #if 0	// After we find the needed convo-izing output-perm, disable this code:
	const uint8 v0[] = {1,2,3,4,5,6}, v1[] = {2,4,6,5,3,1}, v2[] = {3,6,4,1,2,5}, v3[] = {4,5,1,3,6,2}, v4[] = {5,3,2,6,1,4}, v5[] = {6,1,5,2,4,3};
	uint8 vec0[6], vec1[6], vec2[6], vec3[6], vec4[6], vec5[6];
	uint32 k,jp, perm6[6];
	for(i = 0; i < 720; i++) {
		j = nperm(perm6, i == 0, 6);	ASSERT(HERE, j != 0, "Ran out of perms!");
		for(j = 0; j < 6; j++) {
			jp = perm6[j];
			vec0[j] = v0[jp];	vec1[j] = v1[jp];	vec2[j] = v2[jp];	vec3[j] = v3[jp];	vec4[j] = v4[jp];	vec5[j] = v5[jp];
		}
/*
	printf("Trying perm[%u,%u,%u,%u,%u,%u]: vec0 = [%u,%u,%u,%u,%u,%u], vec1 = [%u,%u,%u,%u,%u,%u]\n",
		perm6[0],perm6[1],perm6[2],perm6[3],perm6[4],perm6[5],
		vec0[0],vec0[1],vec0[2],vec0[3],vec0[4],vec0[5],
		vec1[0],vec1[1],vec1[2],vec1[3],vec1[4],vec1[5]
	);
*/
		if(!vec_eq_modulo_cshift(vec0,vec1,nc))
			continue;
		if(!vec_eq_modulo_cshift(vec0,vec2,nc))
			continue;
		if(!vec_eq_modulo_cshift(vec0,vec3,nc))
			continue;
		if(!vec_eq_modulo_cshift(vec0,vec4,nc))
			continue;
		if(!vec_eq_modulo_cshift(vec0,vec5,nc))
			continue;
		printf("Found s-coeff perm:");
		for(j = 0; j < 6; j++) {
			printf("%u.",perm6[j]);
		}
		printf("\n");
	}
	exit(0);
  #endif
	/*
	First suitable perm = a[0,1,3,4,2,5] = s[1,2,3,4,5,6], gives vec0 = [1,2,4,5,3,6], vec1 = [2,4,5,3,6,1] = vec0<<1
	Can get this by shifting rows 4,5 up 1 space, or moving row 3 down 2 spaces to make it row 5:
	+s1*(x1-x12)+s2*(x2-x11)+s3*(x3-x10)+s4*(x4-x9)+s5*(x5-x8)+s6*(x6-x7) = S1
	+s2*(x1-x12)+s4*(x2-x11)+s6*(x3-x10)-s5*(x4-x9)-s3*(x5-x8)-s1*(x6-x7) = S2
	+s4*(x1-x12)-s5*(x2-x11)-s1*(x3-x10)+s3*(x4-x9)-s6*(x5-x8)-s2*(x6-x7) = S4
	+s5*(x1-x12)-s3*(x2-x11)+s2*(x3-x10)-s6*(x4-x9)-s1*(x5-x8)+s4*(x6-x7) = S5
	+s3*(x1-x12)+s6*(x2-x11)-s4*(x3-x10)-s1*(x4-x9)+s2*(x5-x8)+s5*(x6-x7) = S3
	+s6*(x1-x12)-s1*(x2-x11)+s5*(x3-x10)-s2*(x4-x9)+s4*(x5-x8)-s3*(x6-x7) = S6 .
	Now relabeling the outputs-permuted convolution coeffs as a[0,1,2,3,4,5] = s[1,2,4,5,3,6]:
	+a0*(x1-x12)+a1*(x2-x11)+a4*(x3-x10)+a2*(x4-x9)+a3*(x5-x8)+a5*(x6-x7) = S1
	+a1*(x1-x12)+a2*(x2-x11)+a5*(x3-x10)-a3*(x4-x9)-a4*(x5-x8)-a0*(x6-x7) = S2
	+a2*(x1-x12)-a3*(x2-x11)-a0*(x3-x10)+a4*(x4-x9)-a5*(x5-x8)-a1*(x6-x7) = S4
	+a3*(x1-x12)-a4*(x2-x11)+a1*(x3-x10)-a5*(x4-x9)-a0*(x5-x8)+a2*(x6-x7) = S5
	+a4*(x1-x12)+a5*(x2-x11)-a2*(x3-x10)-a0*(x4-x9)+a1*(x5-x8)+a3*(x6-x7) = S3
	+a5*(x1-x12)-a0*(x2-x11)+a3*(x3-x10)-a1*(x4-x9)+a2*(x5-x8)-a4*(x6-x7) = S6 , which is in proper 6-convolution form.
	Equivalently, we relabel using ordered s-indices: a[0,1,4,2,3,5] = s[1,2,3,4,5,6], which gives
	+a0*(x1-x12)+a1*(x2-x11)+a4*(x3-x10)+a2*(x4-x9)+a3*(x5-x8)+a5*(x6-x7) = S1
	+a1*(x1-x12)+a2*(x2-x11)+a5*(x3-x10)-a3*(x4-x9)-a4*(x5-x8)-a0*(x6-x7) = S2
	+a4*(x1-x12)+a5*(x2-x11)-a2*(x3-x10)-a0*(x4-x9)+a1*(x5-x8)+a3*(x6-x7) = S3
	+a2*(x1-x12)-a3*(x2-x11)-a0*(x3-x10)+a4*(x4-x9)-a5*(x5-x8)-a1*(x6-x7) = S4
	+a3*(x1-x12)-a4*(x2-x11)+a1*(x3-x10)-a5*(x4-x9)-a0*(x5-x8)+a2*(x6-x7) = S5
	+a5*(x1-x12)-a0*(x2-x11)+a3*(x3-x10)-a1*(x4-x9)+a2*(x5-x8)-a4*(x6-x7) = S6 ,
	and then shuffling rows so as to put the left-col a-indices in ascending order again reveals the required convo output-reordering:
	+a0*(x1-x12)+a1*(x2-x11)+a4*(x3-x10)+a2*(x4-x9)+a3*(x5-x8)+a5*(x6-x7) = S1
	+a1*(x1-x12)+a2*(x2-x11)+a5*(x3-x10)-a3*(x4-x9)-a4*(x5-x8)-a0*(x6-x7) = S2
	+a2*(x1-x12)-a3*(x2-x11)-a0*(x3-x10)+a4*(x4-x9)-a5*(x5-x8)-a1*(x6-x7) = S4
	+a3*(x1-x12)-a4*(x2-x11)+a1*(x3-x10)-a5*(x4-x9)-a0*(x5-x8)+a2*(x6-x7) = S5
	+a4*(x1-x12)+a5*(x2-x11)-a2*(x3-x10)-a0*(x4-x9)+a1*(x5-x8)+a3*(x6-x7) = S3
	+a5*(x1-x12)-a0*(x2-x11)+a3*(x3-x10)-a1*(x4-x9)+a2*(x5-x8)-a4*(x6-x7) = S6 .

	Q: Can we always find a single-row-move which gives the desired convolution indexing pattern among the s-coeffs?
	A: No such perm works for N=13 and its length-6 subconvolution.
	*/

	/*
	Sine terms are:
	+s1*(x1-x12)+s2*(x2-x11)+s3*(x3-x10)+s4*(x4-x9)+s5*(x5-x8)+s6*(x6-x7) = S1
	+s2*(x1-x12)+s4*(x2-x11)+s6*(x3-x10)-s5*(x4-x9)-s3*(x5-x8)-s1*(x6-x7) = S2
	+s3*(x1-x12)+s6*(x2-x11)-s4*(x3-x10)-s1*(x4-x9)+s2*(x5-x8)+s5*(x6-x7) = S3
	+s4*(x1-x12)-s5*(x2-x11)-s1*(x3-x10)+s3*(x4-x9)-s6*(x5-x8)-s2*(x6-x7) = S4
	+s5*(x1-x12)-s3*(x2-x11)+s2*(x3-x10)-s6*(x4-x9)-s1*(x5-x8)+s4*(x6-x7) = S5
	+s6*(x1-x12)-s1*(x2-x11)+s5*(x3-x10)-s2*(x4-x9)+s4*(x5-x8)-s3*(x6-x7) = S6 .

	Letting s(1,2,3,4,5,6)
		  = a(0,5,2,4,3,1) gives

	+a0*(x1-x12)+a5*(x2-x11)+a2*(x3-x10)+a4*(x4-x9)+a3*(x5-x8)+a1*(x6-x7) = S1
	+a5*(x1-x12)+a4*(x2-x11)+a1*(x3-x10)-a3*(x4-x9)-a2*(x5-x8)-a0*(x6-x7) = S2
	+a2*(x1-x12)+a1*(x2-x11)-a4*(x3-x10)-a0*(x4-x9)+a5*(x5-x8)+a3*(x6-x7) = S3
	+a4*(x1-x12)-a3*(x2-x11)-a0*(x3-x10)+a2*(x4-x9)-a1*(x5-x8)-a5*(x6-x7) = S4
	+a3*(x1-x12)-a2*(x2-x11)+a5*(x3-x10)-a1*(x4-x9)-a0*(x5-x8)+a4*(x6-x7) = S5
	+a1*(x1-x12)-a0*(x2-x11)+a3*(x3-x10)-a5*(x4-x9)+a4*(x5-x8)-a2*(x6-x7) = S6 ,

	and rearranging the rows to reorder the a-terms of the leftmost column yields

	+a0*(x1-x12)+a5*(x2-x11)+a2*(x3-x10)+a4*(x4-x9)+a3*(x5-x8)+a1*(x6-x7) = S1
	+a1*(x1-x12)-a0*(x2-x11)+a3*(x3-x10)-a5*(x4-x9)+a4*(x5-x8)-a2*(x6-x7) = S6
	+a2*(x1-x12)+a1*(x2-x11)-a4*(x3-x10)-a0*(x4-x9)+a5*(x5-x8)+a3*(x6-x7) = S3
	+a3*(x1-x12)-a2*(x2-x11)+a5*(x3-x10)-a1*(x4-x9)-a0*(x5-x8)+a4*(x6-x7) = S5
	+a4*(x1-x12)-a3*(x2-x11)-a0*(x3-x10)+a2*(x4-x9)-a1*(x5-x8)-a5*(x6-x7) = S4
	+a5*(x1-x12)+a4*(x2-x11)+a1*(x3-x10)-a3*(x4-x9)-a2*(x5-x8)-a0*(x6-x7) = S2

	We can see that the columns have convolution index patterns, i.e. the six distinct
	circular shifts of the vector (0,1,2,3,4,5). Rearranging columns to make the a-indices
	strictly ascending (in the circular sense) from left to right within each row:

	S1 = +a0*(x1-x12)+a1*(x6-x7)+a2*(x3-x10)+a3*(x5-x8)+a4*(x4-x9)+a5*(x2-x11)
	S6 = +a1*(x1-x12)-a2*(x6-x7)+a3*(x3-x10)+a4*(x5-x8)-a5*(x4-x9)-a0*(x2-x11)
	S3 = +a2*(x1-x12)+a3*(x6-x7)-a4*(x3-x10)+a5*(x5-x8)-a0*(x4-x9)+a1*(x2-x11)
	S5 = +a3*(x1-x12)+a4*(x6-x7)+a5*(x3-x10)-a0*(x5-x8)-a1*(x4-x9)-a2*(x2-x11)
	S4 = +a4*(x1-x12)-a5*(x6-x7)-a0*(x3-x10)-a1*(x5-x8)+a2*(x4-x9)-a3*(x2-x11)
	S2 = +a5*(x1-x12)-a0*(x6-x7)+a1*(x3-x10)-a2*(x5-x8)-a3*(x4-x9)+a4*(x2-x11)

	Next, letting

	y0 = (x1-x12), y1 = (x6-x7), y2 = (x3-x10), y3 = (x5-x8), y4 = (x4-x9), y5 = (x2-x11),
	(and doing analogously for the cosine terms, except there we don't need to worry about signs) we get

	S1 = +a0*y0+a1*y1+a2*y2+a3*y3+a4*y4+a5*y5
	S6 = +a1*y0-a2*y1+a3*y2+a4*y3-a5*y4-a0*y5
	S3 = +a2*y0+a3*y1-a4*y2+a5*y3-a0*y4+a1*y5
	S5 = +a3*y0+a4*y1+a5*y2-a0*y3-a1*y4-a2*y5
	S4 = +a4*y0-a5*y1-a0*y2-a1*y3+a2*y4-a3*y5
	S2 = +a5*y0-a0*y1+a1*y2-a2*y3-a3*y4+a4*y5 , 15 of the 36 terms have a - sign.

	Thus the pattern of signs (0 = positive, 1 = negative) is, with LSB corr. to y0 and MSB to y5:
		000000 = 0x00
		110010 = 0x32
		010100 = 0x14
		111000 = 0x38
		101110 = 0x2e
		011010 = 0x1a .
	The minimum number of - signs we are able to achieve via some combo of s-term and x-term sign-flips is 9,
	much too large to make a cyclic-convolution-followed-by-added-terms-to-get-needed-minus-signs approach
	interesting ... instead it turns out to be feasible to sign-munge the above into a length-6 *acyclic* convo.
	*/
	#define cshift(x,i) (((x<<(6-i))&mask) + (x>>i)) // Rightward circular shift of 6-bit int x by i bits
	const uint64 rows = (0x32ull<<6)+(0x14ull<<12)+(0x38ull<<18)+(0x2eull<<24)+(0x1aull<<30);	// Concatenate rowwise bit patterns into uint64 ...
	const uint64 mask = 0x3f;											// this limits us to nc = 8 (N = 17), obviously.
	printf("N = 13:\n");
	for(i = 0; i < imax; i++) {	// Loop over all 2^nc sine-terms sign patterns
		mask1 = (uint64)i;	mask1 += (mask1<<6)+(mask1<<12)+(mask1<<18)+(mask1<<24)+(mask1<<30);	// This mask is same for each term of our convolution, so just duplicate 6 times
		for(j = 0; j < imax; j++) {	// Loop over all 2^nc x-terms sign patterns
			// This mask gets left-cshifted one more per row:
			mask2 = (uint64)j;	mask2 += (cshift(mask2,1)<<6)+(cshift(mask2,2)<<12)+(cshift(mask2,3)<<18)+(cshift(mask2,4)<<24)+(cshift(mask2,5)<<30);
			flip = rows ^ mask1 ^ mask2;	// Apply s-term and x-terms flips
			popt = 0;
			popc = pop8[(flip    ) & mask]; popc = MIN(popc, nc-popc); popt += popc;
			popc = pop8[(flip>> 6) & mask]; popc = MIN(popc, nc-popc); popt += popc;
			popc = pop8[(flip>>12) & mask]; popc = MIN(popc, nc-popc); popt += popc;
			popc = pop8[(flip>>18) & mask]; popc = MIN(popc, nc-popc); popt += popc;
			popc = pop8[(flip>>24) & mask]; popc = MIN(popc, nc-popc); popt += popc;
			popc = pop8[(flip>>30) & mask]; popc = MIN(popc, nc-popc); popt += popc;
			nums = popt;
			if(nums < mins) {
				mins = nums;
				printf("\tWith xflip = %llX, sflip = %llX: #sign-flips = %d\n",i,j,mins);
			}
		}
	}

#elif (N == 17)
  #if 0	// After we find the needed convo-izing output-perm, disable this code:
	const uint8 v0[] = {1,2,3,4,5,6,7,8}, v1[] = {2,4,6,8,7,5,3,1}, v2[] = {3,6,8,5,2,1,4,7}, v3[] = {4,8,5,1,3,7,6,2}, v4[] = {5,7,2,3,8,4,1,6}, v5[] = {6,5,1,7,4,2,8,3}, v6[] = {7,3,4,6,1,8,2,5}, v7[] = {8,1,7,2,6,3,5,4};
	uint8 vec0[8], vec1[8], vec2[8], vec3[8], vec4[8], vec5[8], vec6[8], vec7[8];
	uint32 k,jp, perm8[8];
	for(i = 0; i < 40320; i++) {	// Only permute tha last 7 elts of each of our 8-vecs
		j = nperm(perm8, i == 0, 8);	ASSERT(HERE, j != 0, "Ran out of perms!");
		for(j = 0; j < 8; j++) {
			jp = perm8[j];
			vec0[j] = v0[jp]; vec1[j] = v1[jp]; vec2[j] = v2[jp]; vec3[j] = v3[jp]; vec4[j] = v4[jp]; vec5[j] = v5[jp]; vec6[j] = v6[jp]; vec7[j] = v7[jp];
		}
/*
	printf("Trying perm[%u,%u,%u,%u,%u,%u,%u,%u]: vec0 = [%u,%u,%u,%u,%u,%u,%u,%u], vec1 = [%u,%u,%u,%u,%u,%u,%u,%u]\n",
		perm8[0],perm8[1],perm8[2],perm8[3],perm8[4],perm8[5],perm8[6],perm8[7],
		vec0[0],vec0[1],vec0[2],vec0[3],vec0[4],vec0[5],vec0[6],vec0[7],
		vec1[0],vec1[1],vec1[2],vec1[3],vec1[4],vec1[5],vec1[6],vec1[7]
	);
*/
		if(!vec_eq_modulo_cshift(vec0,vec1,nc))
			continue;
		if(!vec_eq_modulo_cshift(vec0,vec2,nc))
			continue;
		if(!vec_eq_modulo_cshift(vec0,vec3,nc))
			continue;
		if(!vec_eq_modulo_cshift(vec0,vec4,nc))
			continue;
		if(!vec_eq_modulo_cshift(vec0,vec5,nc))
			continue;
		if(!vec_eq_modulo_cshift(vec0,vec6,nc))
			continue;
		if(!vec_eq_modulo_cshift(vec0,vec7,nc))
			continue;
		printf("Found s-coeff perm:");
		for(j = 0; j < 8; j++) {
			printf("%u.",perm8[j]);
		}
		printf("\n");
	}
	exit(0);
  #endif
/*
Sine terms are:
	S1 = s1*(x1-xG)+s2*(x2-xF)+s3*(x3-xE)+s4*(x4-xD)+s5*(x5-xC)+s6*(x6-xB)+s7*(x7-xA)+s8*(x8-x9)
	S2 = s2*(x1-xG)+s4*(x2-xF)+s6*(x3-xE)+s8*(x4-xD)-s7*(x5-xC)-s5*(x6-xB)-s3*(x7-xA)-s1*(x8-x9)
	S3 = s3*(x1-xG)+s6*(x2-xF)-s8*(x3-xE)-s5*(x4-xD)-s2*(x5-xC)+s1*(x6-xB)+s4*(x7-xA)+s7*(x8-x9)
	S4 = s4*(x1-xG)+s8*(x2-xF)-s5*(x3-xE)-s1*(x4-xD)+s3*(x5-xC)+s7*(x6-xB)-s6*(x7-xA)-s2*(x8-x9)
	S5 = s5*(x1-xG)-s7*(x2-xF)-s2*(x3-xE)+s3*(x4-xD)+s8*(x5-xC)-s4*(x6-xB)+s1*(x7-xA)+s6*(x8-x9)
	S6 = s6*(x1-xG)-s5*(x2-xF)+s1*(x3-xE)+s7*(x4-xD)-s4*(x5-xC)+s2*(x6-xB)+s8*(x7-xA)-s3*(x8-x9)
	S7 = s7*(x1-xG)-s3*(x2-xF)+s4*(x3-xE)-s6*(x4-xD)+s1*(x5-xC)+s8*(x6-xB)-s2*(x7-xA)+s5*(x8-x9)
	S8 = s8*(x1-xG)-s1*(x2-xF)+s7*(x3-xE)-s2*(x4-xD)+s6*(x5-xC)-s3*(x6-xB)+s5*(x7-xA)-s4*(x8-x9)
Look at s-term index pattern for N=17 and its length-8 subconvolution:
	S1:	1+2+3+4+5+6+7+8
	S2:	2+4+6+8-7-5-3-1
	S3:	3+6-8-5-2+1+4+7
	S4:	4+8-5-1+3+7-6-2
	S5:	5-7-2+3+8-4+1+6
	S6:	6-5+1+7-4+2+8-3
	S7:	7-3+4-6+1+8-2+5
	S8:	8-1+7-2+6-3+5-4
Unlike N=11 and N=13, for N=17 there are no single-row-moves which convo-ize the s-index patterns,
so brute-force try all 8-perms ... here are the 32 (of 40320 possible 8-perms) which achieve the desired index patterning:
	[4,0,6,1,2,3,5,7]
	[6,3,4,1,5,0,2,7]
	[2,0,5,1,4,3,6,7]
	[5,3,2,1,6,0,4,7]
	[3,2,1,6,0,4,7,5]
	[0,6,1,2,3,5,7,4]
	[3,4,1,5,0,2,7,6]
	[0,5,1,4,3,6,7,2]
	[2,1,6,0,4,7,5,3]
	[6,1,2,3,5,7,4,0]
	[4,1,5,0,2,7,6,3]
	[5,1,4,3,6,7,2,0]
	[1,4,3,6,7,2,0,5]
	[1,2,3,5,7,4,0,6]
	[1,6,0,4,7,5,3,2]
	[1,5,0,2,7,6,3,4]
	[4,3,6,7,2,0,5,1]
	[2,3,5,7,4,0,6,1]
	[6,0,4,7,5,3,2,1]
	[5,0,2,7,6,3,4,1]
	[0,2,7,6,3,4,1,5]
	[3,6,7,2,0,5,1,4]
	[0,4,7,5,3,2,1,6]
	[3,5,7,4,0,6,1,2]
	[2,7,6,3,4,1,5,0]
	[6,7,2,0,5,1,4,3]
	[4,7,5,3,2,1,6,0]
	[5,7,4,0,6,1,2,3]
	[7,4,0,6,1,2,3,5]
	[7,6,3,4,1,5,0,2]
	[7,2,0,5,1,4,3,6]
	[7,5,3,2,1,6,0,4]
Here is the result of (say) the perm [0,6,1,2,3,5,7,4]:
	S1:	1+2+3+4+5+6+7+8
	S7:	7-3+4-6+1+8-2+5
	S2:	2+4+6+8-7-5-3-1
	S3:	3+6-8-5-2+1+4+7
	S4:	4+8-5-1+3+7-6-2
	S6:	6-5+1+7-4+2+8-3
	S8:	8-1+7-2+6-3+5-4
	S5:	5-7-2+3+8-4+1+6
In terms of our original variables, this is
	S1 = s1*(x1-xG)+s2*(x2-xF)+s3*(x3-xE)+s4*(x4-xD)+s5*(x5-xC)+s6*(x6-xB)+s7*(x7-xA)+s8*(x8-x9)
	S7 = s7*(x1-xG)-s3*(x2-xF)+s4*(x3-xE)-s6*(x4-xD)+s1*(x5-xC)+s8*(x6-xB)-s2*(x7-xA)+s5*(x8-x9)
	S2 = s2*(x1-xG)+s4*(x2-xF)+s6*(x3-xE)+s8*(x4-xD)-s7*(x5-xC)-s5*(x6-xB)-s3*(x7-xA)-s1*(x8-x9)
	S3 = s3*(x1-xG)+s6*(x2-xF)-s8*(x3-xE)-s5*(x4-xD)-s2*(x5-xC)+s1*(x6-xB)+s4*(x7-xA)+s7*(x8-x9)
	S4 = s4*(x1-xG)+s8*(x2-xF)-s5*(x3-xE)-s1*(x4-xD)+s3*(x5-xC)+s7*(x6-xB)-s6*(x7-xA)-s2*(x8-x9)
	S6 = s6*(x1-xG)-s5*(x2-xF)+s1*(x3-xE)+s7*(x4-xD)-s4*(x5-xC)+s2*(x6-xB)+s8*(x7-xA)-s3*(x8-x9)
	S8 = s8*(x1-xG)-s1*(x2-xF)+s7*(x3-xE)-s2*(x4-xD)+s6*(x5-xC)-s3*(x6-xB)+s5*(x7-xA)-s4*(x8-x9)
	S5 = s5*(x1-xG)-s7*(x2-xF)-s2*(x3-xE)+s3*(x4-xD)+s8*(x5-xC)-s4*(x6-xB)+s1*(x7-xA)+s6*(x8-x9)
Letting s(1,7,2,3,4,6,8,5) = a0-7 gives
	S1 = a0*(x1-xG)+a2*(x2-xF)+a3*(x3-xE)+a4*(x4-xD)+a7*(x5-xC)+a5*(x6-xB)+a1*(x7-xA)+a6*(x8-x9)
	S7 = a1*(x1-xG)-a3*(x2-xF)+a4*(x3-xE)-a5*(x4-xD)+a0*(x5-xC)+a6*(x6-xB)-a2*(x7-xA)+a7*(x8-x9)
	S2 = a2*(x1-xG)+a4*(x2-xF)+a5*(x3-xE)+a6*(x4-xD)-a1*(x5-xC)-a7*(x6-xB)-a3*(x7-xA)-a0*(x8-x9)
	S3 = a3*(x1-xG)+a5*(x2-xF)-a6*(x3-xE)-a7*(x4-xD)-a2*(x5-xC)+a0*(x6-xB)+a4*(x7-xA)+a1*(x8-x9)
	S4 = a4*(x1-xG)+a6*(x2-xF)-a7*(x3-xE)-a0*(x4-xD)+a3*(x5-xC)+a1*(x6-xB)-a5*(x7-xA)-a2*(x8-x9)
	S6 = a5*(x1-xG)-a7*(x2-xF)+a0*(x3-xE)+a1*(x4-xD)-a4*(x5-xC)+a2*(x6-xB)+a6*(x7-xA)-a3*(x8-x9)
	S8 = a6*(x1-xG)-a0*(x2-xF)+a1*(x3-xE)-a2*(x4-xD)+a5*(x5-xC)-a3*(x6-xB)+a7*(x7-xA)-a4*(x8-x9)
	S5 = a7*(x1-xG)-a1*(x2-xF)-a2*(x3-xE)+a3*(x4-xD)+a6*(x5-xC)-a4*(x6-xB)+a0*(x7-xA)+a5*(x8-x9)
We can see that the columns have convolution index patterns, i.e. the eight distinct
circular shifts of the vector (0,1,2,3,4,5,6,7). Rearranging columns to make the a-indices
strictly ascending (in the circular sense) from left to right within each row:
	S1 = a0*(x1-xG)+a1*(x7-xA)+a2*(x2-xF)+a3*(x3-xE)+a4*(x4-xD)+a5*(x6-xB)+a6*(x8-x9)+a7*(x5-xC)
	S7 = a1*(x1-xG)-a2*(x7-xA)-a3*(x2-xF)+a4*(x3-xE)-a5*(x4-xD)+a6*(x6-xB)+a7*(x8-x9)+a0*(x5-xC)
	S2 = a2*(x1-xG)-a3*(x7-xA)+a4*(x2-xF)+a5*(x3-xE)+a6*(x4-xD)-a7*(x6-xB)-a0*(x8-x9)-a1*(x5-xC)
	S3 = a3*(x1-xG)+a4*(x7-xA)+a5*(x2-xF)-a6*(x3-xE)-a7*(x4-xD)+a0*(x6-xB)+a1*(x8-x9)-a2*(x5-xC)
	S4 = a4*(x1-xG)-a5*(x7-xA)+a6*(x2-xF)-a7*(x3-xE)-a0*(x4-xD)+a1*(x6-xB)-a2*(x8-x9)+a3*(x5-xC)
	S6 = a5*(x1-xG)+a6*(x7-xA)-a7*(x2-xF)+a0*(x3-xE)+a1*(x4-xD)+a2*(x6-xB)-a3*(x8-x9)-a4*(x5-xC)
	S8 = a6*(x1-xG)+a7*(x7-xA)-a0*(x2-xF)+a1*(x3-xE)-a2*(x4-xD)-a3*(x6-xB)-a4*(x8-x9)+a5*(x5-xC)
	S5 = a7*(x1-xG)+a0*(x7-xA)-a1*(x2-xF)-a2*(x3-xE)+a3*(x4-xD)-a4*(x6-xB)+a5*(x8-x9)+a6*(x5-xC)
Next, letting

y0 = (x1-xG), y1 = (x7-xA), y2 = (x2-xF), y3 = (x3-xE), y4 = (x4-xD), y5 = (x6-xB), y6 = (x8-x9), y7 = (x5-xC)
(and doing analogously for the cosine terms, except there we don't need to worry about signs) we get
	x-terms ordering by columns: 17234685
	S1 = a0*y0+a1*y1+a2*y2+a3*y3+a4*y4+a5*y5+a6*y6+a7*y7
	S7 = a1*y0-a2*y1-a3*y2+a4*y3-a5*y4+a6*y5+a7*y6+a0*y7
	S2 = a2*y0-a3*y1+a4*y2+a5*y3+a6*y4-a7*y5-a0*y6-a1*y7
	S3 = a3*y0+a4*y1+a5*y2-a6*y3-a7*y4+a0*y5+a1*y6-a2*y7
	S4 = a4*y0-a5*y1+a6*y2-a7*y3-a0*y4+a1*y5-a2*y6+a3*y7
	S6 = a5*y0+a6*y1-a7*y2+a0*y3+a1*y4+a2*y5-a3*y6-a4*y7
	S8 = a6*y0+a7*y1-a0*y2+a1*y3-a2*y4-a3*y5-a4*y6+a5*y7
	S5 = a7*y0+a0*y1-a1*y2-a2*y3+a3*y4-a4*y5+a5*y6+a6*y7 , 24 of 64 terms (37.5%) have - sign.

Next look at the pattern of signs (0 = positive, 1 = negative), with LSB corr. to y0 and MSB to y7:
#-:		58643271, note these are in reverse order due to LSB-MSB-ness
0	S1:	00000000 = 0x00
3	S7:	00010110 = 0x16
4	S2:	11100010 = 0xe2
3	S3:	10011000 = 0x98
4	S4:	01011010 = 0x5a
3	S6:	11000100 = 0xc4
4	S8:	01110100 = 0x74
3	S5:	00101100 = 0x2c
The best we are able to do w.r.to minimizing the number of - signs is sflip = xflip = 0x3, which gives 16 -signs:
and corresponds to flipping signs of a0,a1 and y0,y1:		#- signs in row:
	S1 = +a0*y0+a1*y1+a2*y2+a3*y3+a4*y4+a5*y5+a6*y6+a7*y7	0
	S7 = +a1*y0+a2*y1-a3*y2+a4*y3-a5*y4+a6*y5+a7*y6-a0*y7	3
	S2 = -a2*y0+a3*y1+a4*y2+a5*y3+a6*y4-a7*y5+a0*y6+a1*y7	2
	S3 = -a3*y0-a4*y1+a5*y2-a6*y3-a7*y4-a0*y5-a1*y6-a2*y7	7 (i.e. 1 with -S3)
	S4 = -a4*y0+a5*y1+a6*y2-a7*y3+a0*y4-a1*y5-a2*y6+a3*y7	4
	S6 = -a5*y0-a6*y1-a7*y2-a0*y3-a1*y4+a2*y5-a3*y6-a4*y7	7 (i.e. 1 with -S6)
	S8 = -a6*y0-a7*y1+a0*y2-a1*y3-a2*y4-a3*y5-a4*y6+a5*y7	6 (i.e. 2 with -S8)
	S5 = -a7*y0+a0*y1+a1*y2-a2*y3+a3*y4-a4*y5+a5*y6+a6*y7	3, total = 0+3+2+1+4+1+2+3 = 16.
Q: Can we instead convert into an acyclic convo? That would mean rows with #-signs = 0,1,2,3,4,5,6,7 for a total 28 -signs,
i.e. need a net gain of just 4 - signs.
How to systematize the search? Use same code we do to minimize - sign count, but now make search criterion
a sign patterning such that the MIN(popc, nc-popc) numbers are 0,1,1,2,2,3,3,4, totaling 16.
OH, WAIT - In fact that is *precisely* the sign pattern our above best-result for the cyclic-convo search gives!
Just need to flip the signs of each output rows S3 and S5:
	Flipped signs of a0,a1 and y0,y1:						#- signs in row:
	S1 = +a0*y0+a1*y1+a2*y2+a3*y3+a4*y4+a5*y5+a6*y6+a7*y7	0
	S7 = +a1*y0+a2*y1-a3*y2+a4*y3-a5*y4+a6*y5+a7*y6-a0*y7	3
	S2 = -a2*y0+a3*y1+a4*y2+a5*y3+a6*y4-a7*y5+a0*y6+a1*y7	2
-	S3 = +a3*y0+a4*y1-a5*y2+a6*y3+a7*y4+a0*y5+a1*y6+a2*y7	1
	S4 = -a4*y0+a5*y1+a6*y2-a7*y3+a0*y4-a1*y5-a2*y6+a3*y7	4
	S6 = -a5*y0-a6*y1-a7*y2-a0*y3-a1*y4+a2*y5-a3*y6-a4*y7	7
	S8 = -a6*y0-a7*y1+a0*y2-a1*y3-a2*y4-a3*y5-a4*y6+a5*y7	6
-	S5 = +a7*y0-a0*y1-a1*y2+a2*y3-a3*y4+a4*y5-a5*y6-a6*y7	5
Only problem - Need *another* permutation of a-indices to put the - sign counts in ascending order.
Q: Does one of the acyclic-convo-sign-pattern sign-flips we found automatically order the - sign counts in ascending fashion?
A: Yes: xflip = 4, sflip = 4: pop[0,1,2,3,4,3,2,1] ... That means -a2,-y2:
	Flipped signs of a2,y2:									#- signs in row:
	S1 = +a0*y0+a1*y1+a2*y2+a3*y3+a4*y4+a5*y5+a6*y6+a7*y7	0
	S7 = +a1*y0+a2*y1+a3*y2+a4*y3-a5*y4+a6*y5+a7*y6+a0*y7	1
-	S2 = +a2*y0+a3*y1+a4*y2-a5*y3-a6*y4+a7*y5+a0*y6+a1*y7	2
	S3 = +a3*y0+a4*y1-a5*y2-a6*y3-a7*y4+a0*y5+a1*y6+a2*y7	3
	S4 = +a4*y0-a5*y1-a6*y2-a7*y3-a0*y4+a1*y5+a2*y6+a3*y7	4
-	S6 = -a5*y0-a6*y1-a7*y2-a0*y3-a1*y4+a2*y5+a3*y6+a4*y7	5
-	S8 = -a6*y0-a7*y1-a0*y2-a1*y3-a2*y4+a3*y5+a4*y6-a5*y7	6
-	S5 = -a7*y0-a0*y1-a1*y2-a2*y3-a3*y4+a4*y5-a5*y6-a6*y7	7
Col #- :    3     4     5     6     7     0     1     2			<***** problem! pattern of - signs not same in cols as rows *****
Rearrange cols to put #- signs in proper acyclic columnwise ascending-order:
															Row #-:
	S1 = +a5*y5+a6*y6+a7*y7+a0*y0+a1*y1+a2*y2+a3*y3+a4*y4	0
	S7 = +a6*y5+a7*y6+a0*y7+a1*y0+a2*y1+a3*y2+a4*y3-a5*y4	1
-	S2 = +a7*y5+a0*y6+a1*y7+a2*y0+a3*y1+a4*y2-a5*y3-a6*y4	2
	S3 = +a0*y5+a1*y6+a2*y7+a3*y0+a4*y1-a5*y2-a6*y3-a7*y4	3
	S4 = +a1*y5+a2*y6+a3*y7+a4*y0-a5*y1-a6*y2-a7*y3-a0*y4	4
-	S6 = +a2*y5+a3*y6+a4*y7-a5*y0-a6*y1-a7*y2-a0*y3-a1*y4	5
-	S8 = +a3*y5+a4*y6-a5*y7-a6*y0-a7*y1-a0*y2-a1*y3-a2*y4	6
-	S5 = +a4*y5-a5*y6-a6*y7-a7*y0-a0*y1-a1*y2-a2*y3-a3*y4	7
Col #- :    0     1     2     3     4     5     6     7
...and finally, proper acyclic form has Row #- counts running in opposite (descending) order relative to above,
so simply need to reverse output ordering, which leaves the coeff-patterns-are-cshifts-of-each-other property unaffected:
															Row #-:
7-	S5 = +a4*y5-a5*y6-a6*y7-a7*y0-a0*y1-a1*y2-a2*y3-a3*y4	7
6-	S8 = +a3*y5+a4*y6-a5*y7-a6*y0-a7*y1-a0*y2-a1*y3-a2*y4	6
5-	S6 = +a2*y5+a3*y6+a4*y7-a5*y0-a6*y1-a7*y2-a0*y3-a1*y4	5
4	S4 = +a1*y5+a2*y6+a3*y7+a4*y0-a5*y1-a6*y2-a7*y3-a0*y4	4
3	S3 = +a0*y5+a1*y6+a2*y7+a3*y0+a4*y1-a5*y2-a6*y3-a7*y4	3
2-	S2 = +a7*y5+a0*y6+a1*y7+a2*y0+a3*y1+a4*y2-a5*y3-a6*y4	2
1	S7 = +a6*y5+a7*y6+a0*y7+a1*y0+a2*y1+a3*y2+a4*y3-a5*y4	1
0	S1 = +a5*y5+a6*y6+a7*y7+a0*y0+a1*y1+a2*y2+a3*y3+a4*y4	0
Col #- :    0     1     2     3     4     5     6     7
Lastly, relabel as b0-7 = a43210765 and z0-7 = y56701234:
															Row #-:
7-	S5 = +b0*z0-b7*z1-b6*z2-b5*z3-b4*z4-b3*z5-b2*z6-b1*z7	7
6-	S8 = +b1*z0+b0*z1-b7*z2-b6*z3-b5*z4-b4*z5-b3*z6-b2*z7	6
5-	S6 = +b2*z0+b1*z1+b0*z2-b7*z3-b6*z4-b5*z5-b4*z6-b3*z7	5
4	S4 = +b3*z0+b2*z1+b1*z2+b0*z3-b7*z4-b6*z5-b5*z6-b4*z7	4
3	S3 = +b4*z0+b3*z1+b2*z2+b1*z3+b0*z4-b7*z5-b6*z6-b5*z7	3
2-	S2 = +b5*z0+b4*z1+b3*z2+b2*z3+b1*z4+b0*z5-b7*z6-b6*z7	2
1	S7 = +b6*z0+b5*z1+b4*z2+b3*z3+b2*z4+b1*z5+b0*z6-b7*z7	1
0	S1 = +b7*z0+b6*z1+b5*z2+b4*z3+b3*z4+b2*z5+b1*z6+b0*z7	0
Col #- :    0     1     2     3     4     5     6     7
*/
	// Rightward circular shift of 8-bit int x by i bits.
	// The expansion references `mask`, so an identifier of that name (low-8-bits mask) must be in
	// scope wherever the macro is used. x is assumed to already fit in 8 bits, since the (x)>>(i)
	// term is not masked; i is expected to lie in 1..7.
	// Every argument use is parenthesized so that compound arguments (e.g. a|b, k+1) are shifted
	// as a unit rather than being regrouped by operator precedence.
	#define cshift(x,i) ((((x)<<(8-(i)))&mask) + ((x)>>(i)))
	const uint64 rows = (0x2cull<<56)+(0x74ull<<48)+(0xc4ull<<40)+(0x5aull<<32)+(0x98ull<<24)+(0xe2ull<<16)+(0x16ull<<8)+0x00;
	const uint64 mask = 0xff;	// Use to mask off 1 row at a time in concatenated rows-int for bit counting
	uint8 pop[8];	// For acyclic-convo search
	printf("N = 17:\n");
	for(i = 0; i < imax; i++) {	// Loop over all 2^nc sine-terms sign patterns
		mask1 = (uint64)i;	mask1 += (mask1<<8)+(mask1<<16)+(mask1<<24)+(mask1<<32)+(mask1<<40)+(mask1<<48)+(mask1<<56);	// This mask is same for each term of our convolution, so just duplicate 8 times
		for(j = 0; j < imax; j++) {	// Loop over all 2^nc x-terms sign patterns
			// This mask gets left-cshifted one more per row:
			mask2 = (uint64)j;	mask2 += (cshift(mask2,1)<<8)+(cshift(mask2,2)<<16)+(cshift(mask2,3)<<24)+(cshift(mask2,4)<<32)+(cshift(mask2,5)<<40)+(cshift(mask2,6)<<48)+(cshift(mask2,7)<<56);
			flip = rows ^ mask1 ^ mask2;	// Apply s-term and x-terms flips
			popt = 0;
			popc = pop8[(flip    ) & mask]; popc = MIN(popc, nc-popc); popt += popc; pop[0] = popc;
			popc = pop8[(flip>> 8) & mask]; popc = MIN(popc, nc-popc); popt += popc; pop[1] = popc;
			popc = pop8[(flip>>16) & mask]; popc = MIN(popc, nc-popc); popt += popc; pop[2] = popc;
			popc = pop8[(flip>>24) & mask]; popc = MIN(popc, nc-popc); popt += popc; pop[3] = popc;
			popc = pop8[(flip>>32) & mask]; popc = MIN(popc, nc-popc); popt += popc; pop[4] = popc;
			popc = pop8[(flip>>40) & mask]; popc = MIN(popc, nc-popc); popt += popc; pop[5] = popc;
			popc = pop8[(flip>>48) & mask]; popc = MIN(popc, nc-popc); popt += popc; pop[6] = popc;
			popc = pop8[(flip>>56) & mask]; popc = MIN(popc, nc-popc); popt += popc; pop[7] = popc;
			nums = popt;
		#if 1
			#warning Doing Cyclic-convo search:
			// Cyclic-convo search:
			if(nums < mins) {
				mins = nums;
				printf("\tWith xflip = %llX, sflip = %llX: #sign-flips = %d\n",i,j,mins);
			}
		#else
			#warning Doing Acyclic-convo search:
			// Acyclic-convo search - user should look at printed candidates for popcounts = some permutation of [0,1,1,2,2,3,3,4]:
			if(nums == 16) {
				printf("\tWith xflip = %llX, sflip = %llX: pop[%u,%u,%u,%u,%u,%u,%u,%u]\n",i,j,
					pop[0],pop[1],pop[2],pop[3],pop[4],pop[5],pop[6],pop[7]);
			}
		#endif
		}
	}

#else

	#error This DFT length not (yet) supported! If prime, must add code to do so.

#endif
	return 0;
}
