mingw-w64-crt/math/DFP/mpdecimal-2.3/tests/test_transpose.c - mingw-w64 - Git at Google

 /*
  * Copyright (c) 2008-2010 Stefan Krah. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */


 #include "mpdecimal.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
 #include <assert.h>
 #include "bits.h"
 #include "constants.h"
 #include "mptypes.h"
 #include "typearith.h"
 #include "transpose.h"
 #include "malloc_fail.h"


 /*
  * All the following is pasted from transpose.c for the sole reason of
  * running the tests with uint8_t arrays, which allow greater sizes.
  */


 #define BUFSIZE 4096
 #define SIDE 128


 static inline void pointerswap_c(uint8_t **a, uint8_t **b)
 {
 	uint8_t *tmp;

 	tmp = *b;
 	*b = *a;
 	*a = tmp;
 }

 /* Definition of the matrix transpose */
 static void
 std_trans_c(uint8_t dest[], uint8_t src[], mpd_size_t rows, mpd_size_t cols)
 {
 	mpd_size_t idest, isrc;
 	mpd_size_t r, c;

 	for (r = 0; r < rows; r++) {
 		isrc = r * cols;
 		idest = r;
 		for (c = 0; c < cols; c++) {
 			dest[idest] = src[isrc];
 			isrc += 1;
 			idest += rows;
 		}
 	}
 }

 /*
  * Swap half-rows of 2^n * (2*2^n) matrix.
  * FORWARD_CYCLE: even/odd permutation of the halfrows.
  * BACKWARD_CYCLE: reverse the even/odd permutation.
  */
 static int
 swap_halfrows_pow2_c(uint8_t *matrix, mpd_size_t rows, mpd_size_t cols, int dir)
 {
 	uint8_t buf1[BUFSIZE];
 	uint8_t buf2[BUFSIZE];
 	uint8_t *readbuf, *writebuf, *hp;
 	mpd_size_t *done, dbits;
 	mpd_size_t b = BUFSIZE, stride;
 	mpd_size_t hn, hmax; /* halfrow number */
 	mpd_size_t m, r=0;
 	mpd_size_t offset;
 	mpd_size_t next;


 	assert(cols == mul_size_t(2, rows));

 	if (dir == FORWARD_CYCLE) {
 		r = rows;
 	}
 	else if (dir == BACKWARD_CYCLE) {
 		r = 2;
 	}
 	else {
 		mpd_err_fatal("swap_halfrows_pow2_c: illegal argument for \"dir\"");
 	}

 	m = cols - 1;
 	hmax = rows; /* cycles start at odd halfrows */
 	dbits = 8 * sizeof *done;
 	if ((done = mpd_calloc(hmax/(sizeof *done) + 1, sizeof *done)) == NULL) {
 		return 0;
 	}

 	for (hn = 1; hn <= hmax; hn += 2) {

 		if (done[hn/dbits] & mpd_bits[hn%dbits]) {
 			continue;
 		}

 		readbuf = buf1; writebuf = buf2;

 		for (offset = 0; offset < cols/2; offset += b) {

 			stride = (offset + b < cols/2) ? b : cols/2-offset;

 			hp = matrix + hn*cols/2;
 			memcpy(readbuf, hp+offset, stride*(sizeof *readbuf));
 			pointerswap_c(&readbuf, &writebuf);

 			next = mulmod_size_t(hn, r, m);
 			hp = matrix + next*cols/2;

 			while (next != hn) {

 				memcpy(readbuf, hp+offset, stride*(sizeof *readbuf));
 				memcpy(hp+offset, writebuf, stride*(sizeof *writebuf));
 				pointerswap_c(&readbuf, &writebuf);

 				done[next/dbits] |= mpd_bits[next%dbits];

 				next = mulmod_size_t(next, r, m);
 			        hp = matrix + next*cols/2;

 			}

 			memcpy(hp+offset, writebuf, stride*(sizeof *writebuf));

 			done[hn/dbits] |= mpd_bits[hn%dbits];
 		}
 	}

 	__mingw_dfp_get_globals()->mpd_free(done);
 	return 1;
 }

 /* In-place transpose of a square matrix */
 static inline void
 squaretrans_c(uint8_t *buf, mpd_size_t cols)
 {
 	uint8_t tmp;
 	mpd_size_t idest, isrc;
 	mpd_size_t r, c;

 	for (r = 0; r < cols; r++) {
 		c = r+1;
 		isrc = r*cols + c;
 		idest = c*cols + r;
 		for (c = r+1; c < cols; c++) {
 			tmp = buf[isrc];
 			buf[isrc] = buf[idest];
 			buf[idest] = tmp;
 			isrc += 1;
 			idest += cols;
 		}
 	}
 }

 /*
  * Transpose 2^n * 2^n matrix. For cache efficiency, the matrix is split into
  * square blocks with side length 'SIDE'. First, the blocks are transposed,
  * then a square tranposition is done on each individual block.
  */
 static void
 squaretrans_pow2_c(uint8_t *matrix, mpd_size_t size)
 {
 	uint8_t buf1[SIDE*SIDE];
 	uint8_t buf2[SIDE*SIDE];
 	uint8_t *to, *from;
 	mpd_size_t b = size;
 	mpd_size_t r, c;
 	mpd_size_t i;

 	while (b > SIDE) b >>= 1;

 	for (r = 0; r < size; r += b) {

 		for (c = r; c < size; c += b) {

 			from = matrix + r*size + c;
 			to = buf1;
 			for (i = 0; i < b; i++) {
 				memcpy(to, from, b*(sizeof *to));
 				from += size;
 				to += b;
 			}
 			squaretrans_c(buf1, b);

 			if (r == c) {
 				to = matrix + r*size + c;
 				from = buf1;
 				for (i = 0; i < b; i++) {
 					memcpy(to, from, b*(sizeof *to));
 					from += b;
 					to += size;
 				}
 				continue;
 			}
 			else {
 				from = matrix + c*size + r;
 				to = buf2;
 				for (i = 0; i < b; i++) {
 					memcpy(to, from, b*(sizeof *to));
 					from += size;
 					to += b;
 				}
 				squaretrans_c(buf2, b);

 				to = matrix + c*size + r;
 				from = buf1;
 				for (i = 0; i < b; i++) {
 					memcpy(to, from, b*(sizeof *to));
 					from += b;
 					to += size;
 				}

 				to = matrix + r*size + c;
 				from = buf2;
 				for (i = 0; i < b; i++) {
 					memcpy(to, from, b*(sizeof *to));
 					from += b;
 					to += size;
 				}
 			}
 		}
 	}

 }

 /*
  * In-place transposition of a 2^n x 2^n or a 2^n x (2*2^n)
  * or a (2*2^n) x 2^n matrix.
  */
 static int
 transpose_pow2_c(uint8_t *matrix, mpd_size_t rows, mpd_size_t cols)
 {
 	mpd_size_t size = mul_size_t(rows, cols);

 	assert(ispower2(rows));
 	assert(ispower2(cols));

 	if (cols == rows) {
 		squaretrans_pow2_c(matrix, rows);
 	}
 	else if (cols == mul_size_t(2, rows)) {
 		if (!swap_halfrows_pow2_c(matrix, rows, cols, FORWARD_CYCLE)) {
 			return 0;
 		}
 		squaretrans_pow2_c(matrix, rows);
 		squaretrans_pow2_c(matrix+(size/2), rows);
 	}
 	else if (rows == mul_size_t(2, cols)) {
 		squaretrans_pow2_c(matrix, cols);
 		squaretrans_pow2_c(matrix+(size/2), cols);
 		if (!swap_halfrows_pow2_c(matrix, cols, rows, BACKWARD_CYCLE)) {
 			return 0;
 		}
 	}
 	else {
 		mpd_err_fatal("transpose_pow2_c: illegal matrix size");
 	}

 	return 1;
 }


 #include <time.h>
 #define UMOD_ARRAY ((mpd_uint_t)1<<28)
 #define UCHAR_ARRAY ((mpd_uint_t)1<<29)

 /* The faster in-place functions are tested against std_trans(). */
 static int
 testit_uint(void (* func)(mpd_uint_t *, mpd_size_t, mpd_size_t), mpd_size_t rows, mpd_size_t cols)
 {
 	mpd_uint_t *a = NULL, *src = NULL, *dest = NULL;
 	clock_t start_fast, end_fast, start_std, end_std;
 	mpd_size_t msize;
 	mpd_size_t i;
 	int ret = 1;

 	msize = mul_size_t(rows, cols);
 	if ((a = mpd_alloc(msize, sizeof *a)) == NULL) {
 		goto error;
 	}
 	if ((src = mpd_alloc(msize, sizeof *src)) == NULL) {
 		goto error;
 	}
 	if ((dest = mpd_alloc(msize, sizeof *dest)) == NULL) {
 		goto error;
 	}

 	for (i = 0; i < msize; i++) {
 		a[i] = src[i] = random();
 	}

 	start_std= clock();
 	std_trans(dest, src, rows, cols);
 	end_std= clock();

 	start_fast = clock();
 	func(a, rows, cols);
 	end_fast = clock();

 	for (i = 0; i < msize; i++) {
 		if (a[i] != dest[i]) {
 			fprintf(stderr, "FAIL: a[%"PRI_mpd_size_t"] = %"PRI_mpd_uint_t
                                         "\tdest[%"PRI_mpd_size_t"] = %"PRI_mpd_uint_t"\n",
 			        i, a[i], i, dest[i]);
 			exit(1);
 		}
 	}

 	fprintf(stderr, "size: %10"PRI_mpd_size_t"\tstd_trans: %6.2f sec\t "
                         "in_place_trans: %6.2f sec\n",
 		msize,
 		(double)(end_std-start_std)/(double)CLOCKS_PER_SEC,
 		(double)(end_fast-start_fast)/(double)CLOCKS_PER_SEC);


 out:
 	if (a) __mingw_dfp_get_globals()->mpd_free(a);
 	if (src) __mingw_dfp_get_globals()->mpd_free(src);
 	if (dest) __mingw_dfp_get_globals()->mpd_free(dest);
 	return ret;

 error:
 	ret = 0;
 	goto out;
 }

 /* The faster in-place functions are tested against std_trans(). */
 static int
 testit_uchar(void (* func)(uint8_t *, mpd_size_t, mpd_size_t), mpd_size_t rows, mpd_size_t cols)
 {
 	uint8_t *a = NULL, *src = NULL, *dest = NULL;
 	clock_t start_fast, end_fast, start_std, end_std;
 	mpd_size_t msize;
 	mpd_size_t i;
 	int ret = 1;

 	msize = mul_size_t(rows, cols);
 	if ((a = mpd_alloc(msize, sizeof *a)) == NULL) {
 		goto error;
 	}
 	if ((src = mpd_alloc(msize, sizeof *src)) == NULL) {
 		goto error;
 	}
 	if ((dest = mpd_alloc(msize, sizeof *dest)) == NULL) {
 		goto error;
 	}

 	for (i = 0; i < msize; i++) {
 		a[i] = src[i] = random();
 	}

 	start_std= clock();
 	std_trans_c(dest, src, rows, cols);
 	end_std= clock();

 	start_fast = clock();
 	func(a, rows, cols);
 	end_fast = clock();

 	for (i = 0; i < msize; i++) {
 		if (a[i] != dest[i]) {
 			fprintf(stderr, "FAIL: a[%"PRI_mpd_size_t"] = %d\t"
                                         "dest[%"PRI_mpd_size_t"] = %d\n",
 			        i, a[i], i, dest[i]);
 			exit(1);
 		}
 	}

 	fprintf(stderr, "size: %10"PRI_mpd_size_t"\tstd_trans: %6.2f sec\t "
                         "in_place_trans: %6.2f sec\n",
 		msize,
 		(double)(end_std-start_std)/(double)CLOCKS_PER_SEC,
 		(double)(end_fast-start_fast)/(double)CLOCKS_PER_SEC);


 out:
 	if (a) __mingw_dfp_get_globals()->mpd_free(a);
 	if (src) __mingw_dfp_get_globals()->mpd_free(src);
 	if (dest) __mingw_dfp_get_globals()->mpd_free(dest);
 	return ret;

 error:
 	ret = 0;
 	goto out;
 }


 #define transfunc_uint (void (*)(mpd_uint_t *, mpd_size_t, mpd_size_t))
 #define transfunc_uchar (void (*)(uint8_t *, mpd_size_t, mpd_size_t))


 int main(void)
 {
 	mpd_size_t rows, cols;


 	fprintf(stderr, "Running test_transpose ... \n");

 	fprintf(stderr, "\n2^n * 2^n mpd_uint_t matrices:\n\n");
 	for (rows = 1; mul_size_t(rows, rows) <= UMOD_ARRAY; rows *= 2) {
 		if (!testit_uint(transfunc_uint transpose_pow2, rows, rows)) {
 			break;
 		}
 	}

 	fprintf(stderr, "\n2^n * 2*2^n mpd_uint_t matrices:\n\n");
 	for (rows = 8, cols = 16; mul_size_t(rows, cols) <= UMOD_ARRAY; rows *= 2, cols*=2) {
 		if (!testit_uint(transfunc_uint transpose_pow2, rows, cols)) {
 			break;
 		}
 	}

 	fprintf(stderr, "\n2*2^n * 2^n mpd_uint_t matrices:\n\n");
 	for (rows = 16, cols = 8; mul_size_t(rows, cols) <= UMOD_ARRAY; rows *= 2, cols*=2) {
 		if (!testit_uint(transfunc_uint transpose_pow2, rows, cols)) {
 			break;
 		}
 	}


 	fprintf(stderr, "\n2^n * 2^n uint8_t matrices:\n\n");
 	for (rows = 1; mul_size_t(rows, rows) <= UCHAR_ARRAY; rows *= 2) {
 		if (!testit_uchar(transfunc_uchar transpose_pow2_c, rows, rows)) {
 			break;
 		}
 	}

 	fprintf(stderr, "\n2^n * 2*2^n uint8_t matrices:\n\n");
 	for (rows = 8, cols = 16; mul_size_t(rows, cols) <= UCHAR_ARRAY; rows *= 2, cols*=2) {
 		if (!testit_uchar(transfunc_uchar transpose_pow2_c, rows, cols)) {
 			break;
 		}
 	}

 	fprintf(stderr, "\n2*2^n * 2^n uint8_t matrices:\n\n");
 	for (rows = 16, cols = 8; mul_size_t(rows, cols) <= UCHAR_ARRAY; rows *= 2, cols*=2) {
 		if (!testit_uchar(transfunc_uchar transpose_pow2_c, rows, cols)) {
 			break;
 		}
 	}

 	fprintf(stderr, "\ntest_transpose: PASS\n\n");

 	return 0;
 }
	/*
	* Copyright (c) 2008-2010 Stefan Krah. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	*
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*/


	#include "mpdecimal.h"
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <limits.h>
	#include <assert.h>
	#include "bits.h"
	#include "constants.h"
	#include "mptypes.h"
	#include "typearith.h"
	#include "transpose.h"
	#include "malloc_fail.h"



	/*
	* All the following is pasted from transpose.c for the sole reason of
	* running the tests with uint8_t arrays, which allow greater sizes.
	*/


	#define BUFSIZE 4096
	#define SIDE 128


	static inline void pointerswap_c(uint8_t a, uint8_t b)
	{
	uint8_t *tmp;

	tmp = *b;
	b = a;
	*a = tmp;
	}

	/* Definition of the matrix transpose */
	static void
	std_trans_c(uint8_t dest[], uint8_t src[], mpd_size_t rows, mpd_size_t cols)
	{
	mpd_size_t idest, isrc;
	mpd_size_t r, c;

	for (r = 0; r < rows; r++) {
	isrc = r * cols;
	idest = r;
	for (c = 0; c < cols; c++) {
	dest[idest] = src[isrc];
	isrc += 1;
	idest += rows;
	}
	}
	}

	/*
	* Swap half-rows of 2^n * (2*2^n) matrix.
	* FORWARD_CYCLE: even/odd permutation of the halfrows.
	* BACKWARD_CYCLE: reverse the even/odd permutation.
	*/
	static int
	swap_halfrows_pow2_c(uint8_t *matrix, mpd_size_t rows, mpd_size_t cols, int dir)
	{
	uint8_t buf1[BUFSIZE];
	uint8_t buf2[BUFSIZE];
	uint8_t readbuf, writebuf, *hp;
	mpd_size_t *done, dbits;
	mpd_size_t b = BUFSIZE, stride;
	mpd_size_t hn, hmax; /* halfrow number */
	mpd_size_t m, r=0;
	mpd_size_t offset;
	mpd_size_t next;


	assert(cols == mul_size_t(2, rows));

	if (dir == FORWARD_CYCLE) {
	r = rows;
	}
	else if (dir == BACKWARD_CYCLE) {
	r = 2;
	}
	else {
	mpd_err_fatal("swap_halfrows_pow2_c: illegal argument for \"dir\"");
	}

	m = cols - 1;
	hmax = rows; /* cycles start at odd halfrows */
	dbits = 8 * sizeof *done;
	if ((done = mpd_calloc(hmax/(sizeof done) + 1, sizeof done)) == NULL) {
	return 0;
	}

	for (hn = 1; hn <= hmax; hn += 2) {

	if (done[hn/dbits] & mpd_bits[hn%dbits]) {
	continue;
	}

	readbuf = buf1; writebuf = buf2;

	for (offset = 0; offset < cols/2; offset += b) {

	stride = (offset + b < cols/2) ? b : cols/2-offset;

	hp = matrix + hn*cols/2;
	memcpy(readbuf, hp+offset, stride(sizeof readbuf));
	pointerswap_c(&readbuf, &writebuf);

	next = mulmod_size_t(hn, r, m);
	hp = matrix + next*cols/2;

	while (next != hn) {

	memcpy(readbuf, hp+offset, stride(sizeof readbuf));
	memcpy(hp+offset, writebuf, stride(sizeof writebuf));
	pointerswap_c(&readbuf, &writebuf);

	done[next/dbits] \|= mpd_bits[next%dbits];

	next = mulmod_size_t(next, r, m);
	hp = matrix + next*cols/2;

	}

	memcpy(hp+offset, writebuf, stride(sizeof writebuf));

	done[hn/dbits] \|= mpd_bits[hn%dbits];
	}
	}

	__mingw_dfp_get_globals()->mpd_free(done);
	return 1;
	}

	/* In-place transpose of a square matrix */
	static inline void
	squaretrans_c(uint8_t *buf, mpd_size_t cols)
	{
	uint8_t tmp;
	mpd_size_t idest, isrc;
	mpd_size_t r, c;

	for (r = 0; r < cols; r++) {
	c = r+1;
	isrc = r*cols + c;
	idest = c*cols + r;
	for (c = r+1; c < cols; c++) {
	tmp = buf[isrc];
	buf[isrc] = buf[idest];
	buf[idest] = tmp;
	isrc += 1;
	idest += cols;
	}
	}
	}

	/*
	* Transpose 2^n * 2^n matrix. For cache efficiency, the matrix is split into
	* square blocks with side length 'SIDE'. First, the blocks are transposed,
	* then a square tranposition is done on each individual block.
	*/
	static void
	squaretrans_pow2_c(uint8_t *matrix, mpd_size_t size)
	{
	uint8_t buf1[SIDE*SIDE];
	uint8_t buf2[SIDE*SIDE];
	uint8_t to, from;
	mpd_size_t b = size;
	mpd_size_t r, c;
	mpd_size_t i;

	while (b > SIDE) b >>= 1;

	for (r = 0; r < size; r += b) {

	for (c = r; c < size; c += b) {

	from = matrix + r*size + c;
	to = buf1;
	for (i = 0; i < b; i++) {
	memcpy(to, from, b(sizeof to));
	from += size;
	to += b;
	}
	squaretrans_c(buf1, b);

	if (r == c) {
	to = matrix + r*size + c;
	from = buf1;
	for (i = 0; i < b; i++) {
	memcpy(to, from, b(sizeof to));
	from += b;
	to += size;
	}
	continue;
	}
	else {
	from = matrix + c*size + r;
	to = buf2;
	for (i = 0; i < b; i++) {
	memcpy(to, from, b(sizeof to));
	from += size;
	to += b;
	}
	squaretrans_c(buf2, b);

	to = matrix + c*size + r;
	from = buf1;
	for (i = 0; i < b; i++) {
	memcpy(to, from, b(sizeof to));
	from += b;
	to += size;
	}

	to = matrix + r*size + c;
	from = buf2;
	for (i = 0; i < b; i++) {
	memcpy(to, from, b(sizeof to));
	from += b;
	to += size;
	}
	}
	}
	}

	}

	/*
	* In-place transposition of a 2^n x 2^n or a 2^n x (2*2^n)
	* or a (2*2^n) x 2^n matrix.
	*/
	static int
	transpose_pow2_c(uint8_t *matrix, mpd_size_t rows, mpd_size_t cols)
	{
	mpd_size_t size = mul_size_t(rows, cols);

	assert(ispower2(rows));
	assert(ispower2(cols));

	if (cols == rows) {
	squaretrans_pow2_c(matrix, rows);
	}
	else if (cols == mul_size_t(2, rows)) {
	if (!swap_halfrows_pow2_c(matrix, rows, cols, FORWARD_CYCLE)) {
	return 0;
	}
	squaretrans_pow2_c(matrix, rows);
	squaretrans_pow2_c(matrix+(size/2), rows);
	}
	else if (rows == mul_size_t(2, cols)) {
	squaretrans_pow2_c(matrix, cols);
	squaretrans_pow2_c(matrix+(size/2), cols);
	if (!swap_halfrows_pow2_c(matrix, cols, rows, BACKWARD_CYCLE)) {
	return 0;
	}
	}
	else {
	mpd_err_fatal("transpose_pow2_c: illegal matrix size");
	}

	return 1;
	}


	#include <time.h>
	#define UMOD_ARRAY ((mpd_uint_t)1<<28)
	#define UCHAR_ARRAY ((mpd_uint_t)1<<29)

	/* The faster in-place functions are tested against std_trans(). */
	static int
	testit_uint(void (* func)(mpd_uint_t *, mpd_size_t, mpd_size_t), mpd_size_t rows, mpd_size_t cols)
	{
	mpd_uint_t a = NULL, src = NULL, *dest = NULL;
	clock_t start_fast, end_fast, start_std, end_std;
	mpd_size_t msize;
	mpd_size_t i;
	int ret = 1;

	msize = mul_size_t(rows, cols);
	if ((a = mpd_alloc(msize, sizeof *a)) == NULL) {
	goto error;
	}
	if ((src = mpd_alloc(msize, sizeof *src)) == NULL) {
	goto error;
	}
	if ((dest = mpd_alloc(msize, sizeof *dest)) == NULL) {
	goto error;
	}

	for (i = 0; i < msize; i++) {
	a[i] = src[i] = random();
	}

	start_std= clock();
	std_trans(dest, src, rows, cols);
	end_std= clock();

	start_fast = clock();
	func(a, rows, cols);
	end_fast = clock();

	for (i = 0; i < msize; i++) {
	if (a[i] != dest[i]) {
	fprintf(stderr, "FAIL: a[%"PRI_mpd_size_t"] = %"PRI_mpd_uint_t
	"\tdest[%"PRI_mpd_size_t"] = %"PRI_mpd_uint_t"\n",
	i, a[i], i, dest[i]);
	exit(1);
	}
	}

	fprintf(stderr, "size: %10"PRI_mpd_size_t"\tstd_trans: %6.2f sec\t "
	"in_place_trans: %6.2f sec\n",
	msize,
	(double)(end_std-start_std)/(double)CLOCKS_PER_SEC,
	(double)(end_fast-start_fast)/(double)CLOCKS_PER_SEC);


	out:
	if (a) __mingw_dfp_get_globals()->mpd_free(a);
	if (src) __mingw_dfp_get_globals()->mpd_free(src);
	if (dest) __mingw_dfp_get_globals()->mpd_free(dest);
	return ret;

	error:
	ret = 0;
	goto out;
	}

	/* The faster in-place functions are tested against std_trans(). */
	static int
	testit_uchar(void (* func)(uint8_t *, mpd_size_t, mpd_size_t), mpd_size_t rows, mpd_size_t cols)
	{
	uint8_t a = NULL, src = NULL, *dest = NULL;
	clock_t start_fast, end_fast, start_std, end_std;
	mpd_size_t msize;
	mpd_size_t i;
	int ret = 1;

	msize = mul_size_t(rows, cols);
	if ((a = mpd_alloc(msize, sizeof *a)) == NULL) {
	goto error;
	}
	if ((src = mpd_alloc(msize, sizeof *src)) == NULL) {
	goto error;
	}
	if ((dest = mpd_alloc(msize, sizeof *dest)) == NULL) {
	goto error;
	}

	for (i = 0; i < msize; i++) {
	a[i] = src[i] = random();
	}

	start_std= clock();
	std_trans_c(dest, src, rows, cols);
	end_std= clock();

	start_fast = clock();
	func(a, rows, cols);
	end_fast = clock();

	for (i = 0; i < msize; i++) {
	if (a[i] != dest[i]) {
	fprintf(stderr, "FAIL: a[%"PRI_mpd_size_t"] = %d\t"
	"dest[%"PRI_mpd_size_t"] = %d\n",
	i, a[i], i, dest[i]);
	exit(1);
	}
	}

	fprintf(stderr, "size: %10"PRI_mpd_size_t"\tstd_trans: %6.2f sec\t "
	"in_place_trans: %6.2f sec\n",
	msize,
	(double)(end_std-start_std)/(double)CLOCKS_PER_SEC,
	(double)(end_fast-start_fast)/(double)CLOCKS_PER_SEC);


	out:
	if (a) __mingw_dfp_get_globals()->mpd_free(a);
	if (src) __mingw_dfp_get_globals()->mpd_free(src);
	if (dest) __mingw_dfp_get_globals()->mpd_free(dest);
	return ret;

	error:
	ret = 0;
	goto out;
	}


	#define transfunc_uint (void ()(mpd_uint_t , mpd_size_t, mpd_size_t))
	#define transfunc_uchar (void ()(uint8_t , mpd_size_t, mpd_size_t))


	int main(void)
	{
	mpd_size_t rows, cols;


	fprintf(stderr, "Running test_transpose ... \n");

	fprintf(stderr, "\n2^n * 2^n mpd_uint_t matrices:\n\n");
	for (rows = 1; mul_size_t(rows, rows) <= UMOD_ARRAY; rows *= 2) {
	if (!testit_uint(transfunc_uint transpose_pow2, rows, rows)) {
	break;
	}
	}

	fprintf(stderr, "\n2^n * 2*2^n mpd_uint_t matrices:\n\n");
	for (rows = 8, cols = 16; mul_size_t(rows, cols) <= UMOD_ARRAY; rows = 2, cols=2) {
	if (!testit_uint(transfunc_uint transpose_pow2, rows, cols)) {
	break;
	}
	}

	fprintf(stderr, "\n22^n 2^n mpd_uint_t matrices:\n\n");
	for (rows = 16, cols = 8; mul_size_t(rows, cols) <= UMOD_ARRAY; rows = 2, cols=2) {
	if (!testit_uint(transfunc_uint transpose_pow2, rows, cols)) {
	break;
	}
	}


	fprintf(stderr, "\n2^n * 2^n uint8_t matrices:\n\n");
	for (rows = 1; mul_size_t(rows, rows) <= UCHAR_ARRAY; rows *= 2) {
	if (!testit_uchar(transfunc_uchar transpose_pow2_c, rows, rows)) {
	break;
	}
	}

	fprintf(stderr, "\n2^n * 2*2^n uint8_t matrices:\n\n");
	for (rows = 8, cols = 16; mul_size_t(rows, cols) <= UCHAR_ARRAY; rows = 2, cols=2) {
	if (!testit_uchar(transfunc_uchar transpose_pow2_c, rows, cols)) {
	break;
	}
	}

	fprintf(stderr, "\n22^n 2^n uint8_t matrices:\n\n");
	for (rows = 16, cols = 8; mul_size_t(rows, cols) <= UCHAR_ARRAY; rows = 2, cols=2) {
	if (!testit_uchar(transfunc_uchar transpose_pow2_c, rows, cols)) {
	break;
	}
	}

	fprintf(stderr, "\ntest_transpose: PASS\n\n");

	return 0;
	}