/*
 * Copyright 2002 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%W% %E% SMI"

/*
 * This file contains test loops for stressing the Level 2 data cache.
 * It currently contains the following :
 * 1. Hit Loops
 *	1.1 Read hits
 *		Write (first write to line miss, followed by hits in line)
 *		Read in increments of line size(all hits to L2)
 *		Write in increments of line size(all hits to L2)
 *	1.2 Write hits
 *		Write the complete buffer of size equal to cache size. Repeat
 *		This is done for ulong and double data types.
 *	1.3 50% read hits
 *		Read in increments of line size / 2. First read will be a miss
 *		followed by a hit.
 *	1.4 Derived data type hits
 *		Write and Write-Read-Verify loops with a derived datatype
 *		This accesses the L2 cache with different alignments.
 *
 * 2. Miss Loops
 *	2.1. Write miss loop
 *	2.2. Read miss loop
 *	2.3. Write miss read loop
 *	2.4. Read miss write loop
 *
 *
 * 3. Set Miss Loops  : Access all lines for the set before accessing next
 *	set. Operations tested are :
 *	  3.1. Write set miss loop
 *	  3.2. Read set miss loop
 *	  3.3. Write miss Read loop
 *	  3.4. Read miss Write loop
 *
 *
 * 4. Random offset loops : Test loops which access a buffer at random
 *    locations. The operations for random access loop are :
 *	 4.1 Write
 *	 4.2 Write-Read-Verify
 *	 4.3 Read
 *
 * TBD :
 *	basic_dtype_l2cachehit_stress : datatypes char, short, int
 *
 */

#include	<sys/types.h>
#include	<sys/errno.h>
#include	<sys/time.h>
#include	<signal.h>
#include	<stdlib.h>
#include	<sys/mman.h>
#include	<unistd.h>
#include	<stdio.h>
#include	<testinfo.h>
#include	<note.h>
#include	"l2_util.h"
#include	"l2cache.h"

#define	L2_CACHE_TEST_NAME	"l2cache"

/*
 * Definition of the derived data type used by the derived data type tests
 */
#pragma pack(1)
typedef struct derived_dtype {
	char	char_elem;
	short	short_elem;
	int	int_elem;
	ulong_t	long_elem;
} derived_dtype_s;
#pragma pack()
typedef derived_dtype_s *derived_dtype_t;

/* Pagesize for this system */
static size_t pagesize;

static int basic_dtype_l2cachehit_stress(size_t bufsize,
    int number_loops);
static int l2cachehit_wrrdwr(size_t bufsize, size_t line_size,
    int num_loops);
static int derived_dtype_write_read_verify(derived_dtype_t buf,
    size_t bufsize, int num_loops, derived_dtype_t val);
static int derived_dtype_write(derived_dtype_t buf,
    size_t bufsize, int num_loops, derived_dtype_t val);
static int cache_writemiss(ulong_t *buf, size_t bufsize,
    size_t row_size, int num_loops, ulong_t write_val);
static int cache_readmiss(ulong_t *buf, size_t bufsize,
    size_t row_size, int num_loops);
static int cache_readmiss_write(ulong_t *buf, size_t bufsize,
    size_t row_size, int num_loops, ulong_t write_val);
static int cache_writemiss_read(ulong_t *buf, size_t bufsize,
    size_t row_size, int num_loops, ulong_t write_val);
static int cache_sets_trash(ulong_t *buf, size_t cache_size,
    size_t line_size, int set_associativity, int num_loops,
    ulong_t write_val, int operation);
static int rand_off_wr_loop(ulong_t *buf, size_t bufsize,
    size_t numloops, ulong_t write_val);
static int rand_off_wrrdvrfy_loop(ulong_t *buf, size_t bufsize,
    int numloops, ulong_t write_val);
static int rand_off_rd_loop(ulong_t *buf, size_t bufsize,
    int numloops);
static	int cache_march_stress1(size_t cache_size,
    size_t line_size, int num_loops);
static  int cache_march_stress2(size_t cache_size, int num_loops);

/*
 * Optimization for all test loops to be minimal.
 */
#pragma	opt 1	(basic_dtype_l2cachehit_stress, l2cachehit_wrrdwr)
#pragma	opt 1	(derived_dtype_write_read_verify, derived_dtype_write)
#pragma	opt 1	(cache_writemiss, cache_readmiss, cache_writemiss_read)
#pragma	opt 1	(cache_sets_trash, rand_off_wr_loop)
#pragma	opt 1	(rand_off_wrrdvrfy_loop, rand_off_rd_loop)
#pragma	opt 1	(cache_march_stress1, cache_march_stress2)

/* flags to differentiate the cache sets trash loops */
#define		LOOP_BASE	0
#define		WR_MISS		((LOOP_BASE)+1)
#define		RD_MISS		((LOOP_BASE)+2)
#define		WR_MISS_RD	((LOOP_BASE)+3)
#define		RD_MISS_WR	((LOOP_BASE)+4)

/* Cache random swapping function */
extern  int crand(ulong_t);
extern  int cache_march_stress3(size_t, size_t, int);

#ifdef L2CACHE_PERF_MONITOR
/*
 * Processor event string for DC reads and write statistics
 * This strings are supported on Ultra Sparc-I,II,III
 * TBD : Initialize these strings dynamically, based on the
 * processor type.
 */
static char *l2wr_perf_string;
static char *l2rd_perf_string;

/* Event string for display only */
static char *l2_pic0_name;
static char *l2_pic1_name;

/*
 * l2cache_cpc_valid - Set if CPC is valid for this processor
 * l2cache_perf_valid - Set if cpc_setup succeeded and the "before"
 *		sample was obtained correctly
 * l2cache_cpc_ver - CPU CPC version
 */
static int l2cache_cpc_valid = 0;
static int l2cache_perf_valid;
static int l2cache_cpc_ver;

/*
 * L2CACHE_CPC_SETUP -- program the CPU performance counters with the
 * given event string and take the "before" sample.
 *
 * NOTE: this macro expects a cpc_event_t variable named 'before' to be
 * declared in the enclosing function scope.  On any failure it clears
 * l2cache_perf_valid so L2CACHE_CPC_AFTER becomes a no-op.
 */
#define	L2CACHE_CPC_SETUP(perf_string)	\
	if (l2cache_cpc_valid) {	\
		l2cache_perf_valid =	\
		    (l2cache_cpc_setup((perf_string),	\
		    l2cache_cpc_ver) != -1);	\
		if (l2cache_perf_valid &&	\
		    cpc_take_sample(&before) == -1) {	\
			l2cache_perf_valid = 0;	\
			vts_message(	\
			    NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,	\
			    "In cpc_take_sample(before)\n");	\
		}	\
	}

/*
 * L2CACHE_CPC_AFTER -- take the "after" sample, compute the delta
 * against "before" and log both PIC counter deltas (verbose level).
 *
 * NOTE: expects cpc_event_t variables 'before', 'after' and 'diff' in
 * the enclosing function scope.  Does nothing unless the matching
 * L2CACHE_CPC_SETUP succeeded.
 */
#define	L2CACHE_CPC_AFTER(msg_prefix)	\
	if (l2cache_cpc_valid && l2cache_perf_valid) {	\
		if (cpc_take_sample(&after) == -1) {	\
			vts_message(	\
			    NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,	\
			    "In cpc_take_sample(after)\n");	\
		} else {	\
			cpc_event_diff(&diff, &after, &before);	\
			diff.ce_pic[1] = after.ce_pic[1] -	\
				before.ce_pic[1];	\
			diff.ce_hrt = after.ce_hrt - before.ce_hrt;	\
			vts_message(	\
			    NO_EXIT, VERBOSE, L2_CACHE_TEST_NAME, 0,	\
			    "L2 cache %s%s %llu, %s %llu\n",	\
			    (msg_prefix), l2_pic0_name,	\
			    diff.ce_pic[0], l2_pic1_name,	\
			    diff.ce_pic[1]);	\
		}	\
	}
#else
/* Performance monitoring disabled: both macros expand to nothing */
#define	L2CACHE_CPC_SETUP(perf_string)
#define	L2CACHE_CPC_AFTER(msg_prefix)
#endif	/* L2CACHE_PERF_MONITOR */


/*
 * L2 cache hit loops
 * --------------------
 * L2 Cache Write/Write-Read-Verify cache hit test loops.
 * These test loops are common for all basic data types.  These test
 * loops continue for the number of test loops specified as an
 * argument.
 *
 * The general format of the test loops is as below :
 *	for num_loops do
 *		Continue till buffer limit is reached
 *			<Test-operation>
 *
 * These defines SHOULD be used as inline code. The following
 * SHOULD be passed as parameters :
 *	1. datatype	- Any of the basic datatype
 *	2. datatype	*buf
 *	4. int		ret_val : Return Value
 *	5. datatype	val : Value to write / Read verify
 *	6. int		num_loops
 *
 * ASSUMPTION : Only called once in a function for a datatype.
 *              Multiple calls WILL result in compile errors
 *
 * Test Loops
 *		Write
 *		Write-Read-Verify
 */

/*
 * l2cachehit_write -- fill the buffer with 'val', ping-ponging the walk
 * direction on every pass so consecutive passes hit the lines just
 * touched.  Sets ret_val to 0 on completion.
 *
 * Fixes: the spurious '##' before 'datatype' (token pasting is only
 * valid between two tokens that form a valid token; '(##datatype' is
 * ill-formed ISO C), and the one-past-the-end write on direction
 * reversal: the pointer now stays on the last element written, as in
 * derived_dtype_write().
 *
 * NOTE(review): sizeof (buf) only yields the buffer length when 'buf'
 * is an actual array; callers passing a pointer get sizeof a pointer
 * (one or two elements) -- confirm intent at the call sites.
 */
#define	l2cachehit_write(datatype, buf, ret_val,	\
    val, num_loops) \
	{ \
		int limit = sizeof (buf) / sizeof (datatype);	\
		int direction = 1;	\
		datatype *localbuf = (buf);	\
		int cnt = num_loops;	\
		while (cnt--) { \
			int index = limit;	\
			while (index--) {	\
				*localbuf = (val);	\
				if (index)	\
					localbuf += direction;	\
			}	\
			direction *= -1;	\
		}	\
		(ret_val) = 0;	\
	}

/*
 * l2cachehit_write_read_verify -- write 'val' to every element, read it
 * back immediately and verify, ping-ponging direction per pass.  Sets
 * ret_val to 0 on success, -1 on a miscompare (early exit via the
 * per-datatype label, so this macro may be used only once per datatype
 * in a function).
 *
 * Fixes: the spurious '##' before 'datatype' in the declaration and
 * sizeof (ill-formed pasting); the one-past-the-end access on direction
 * reversal (pointer now stays on the last element, as in
 * derived_dtype_write_read_verify()); inconsistent brace indentation.
 * The legitimate label paste wr_rd_vrfy_##datatype is kept.
 */
#define	l2cachehit_write_read_verify(datatype, buf, \
    ret_val, val, num_loops) \
	{ \
		datatype	*localbuf = (buf);	\
		int direction = 1;	\
		int limit = sizeof (buf) / sizeof (datatype);	\
		int cnt = num_loops;	\
		while (cnt--) { \
			int index = limit;	\
			while (index--) {	\
				*localbuf = val;	\
				if (*localbuf != val) {	\
					(ret_val) = -1;	\
					goto wr_rd_vrfy_##datatype; \
				}	\
				if (index)	\
					localbuf += direction;	\
			}	\
			direction *= -1;	\
		}	\
		ret_val = 0; \
wr_rd_vrfy_##datatype:	\
		;	\
	}

/*
 * Perform hit test loops for all REQUIRED basic data types,
 * for the specified buffer size.
 *
 * Currently this supports ulong and double
 */
static int
basic_dtype_l2cachehit_stress(size_t bufsize, int number_loops)
{
	void	*buf;
	ulong_t	*ulong_buf;
	ulong_t	write_ulong = (ulong_t)lrand48();
	double	*double_buf;
	double	write_double = drand48();
	int	rc = 0;
	int lock_status = 0;

	/* Allocate a page-aligned buffer shared by all loops below */
	buf = (ulong_t *)memalign(pagesize, (size_t)bufsize);
	if (buf == NULL) {
		return (ENOMEM);
	}
	/* Lock the buffer; failure is logged but not fatal */
	lock_status = mlock((void *)buf, bufsize);
	if (lock_status) {
		/* Log an error message(Notice) that locking failed */
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "basic_dtype, ulong mlock failed!!\n");
	}
	ulong_buf = (ulong_t *)buf;
	double_buf = (double *)buf;

	/*
	 * NOTE(review): the l2cachehit_* macros size their walk with
	 * sizeof (buf); since ulong_buf/double_buf are pointers this
	 * covers sizeof (pointer) bytes, not bufsize -- confirm intent.
	 */
	/* Do basic data type test loops for ulong_t */
	l2cachehit_write(ulong_t, ulong_buf, rc,
	    write_ulong, number_loops);
	if (rc)
		goto basic_dtype_exit;

	write_ulong *= 2;	/* Change write_ulong */

	l2cachehit_write_read_verify(ulong_t, ulong_buf,
	    rc, write_ulong, number_loops);
	if (rc)
		goto basic_dtype_exit;

	/* Do basic data type test loops for double */
	l2cachehit_write(double, double_buf,
	    rc, write_double, number_loops);
	if (rc)
		goto basic_dtype_exit;

	write_double *= 2;
	l2cachehit_write_read_verify(double, double_buf,
	    rc, write_double, number_loops);

basic_dtype_exit:
	/*
	 * Unlock the memory, but only if the mlock above succeeded
	 * (lock_status == 0).  The previous test was inverted and
	 * called munlock() exactly when the lock had failed.
	 */
	if (lock_status == 0) {
		lock_status = munlock((void *)double_buf, bufsize);
		if (lock_status) {
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME,
			    0, "basic_dtype munlock failure!!\n");
		}
	}

	free(double_buf);
	return (rc);
}

/*
 * This is a Read miss loop followed by  Read hit loop and Write hit
 * loop. This is currently supported only for ulong_t datatype. There
 * apparently does not seem to be any value add by extending this for
 * other datatypes.
 */
static int
l2cachehit_wrrdwr(size_t bufsize, size_t line_size, int num_loops)
{
	ulong_t	write_val;
	ulong_t	*buf;
	int	lock_status = 0;
	ulong_t	limit;		/* buffer length in words */
	ulong_t	stride;		/* one cache line, in words */
	int	loop_cnt;
	ulong_t	i;

	write_val = (ulong_t)lrand48();
	limit = bufsize / sizeof (ulong_t);
	/*
	 * Fix: advance the word index by line_size expressed in words.
	 * The previous code stepped by line_size (bytes) words, i.e.
	 * sizeof (ulong_t) lines at a time, skipping most lines.
	 */
	stride = line_size / sizeof (ulong_t);
	if (stride == 0)
		stride = 1;	/* robustness for tiny line sizes */
	loop_cnt = num_loops;

	/* Allocate and lock the buffer for the test loop */
	buf = (ulong_t *)memalign((size_t)pagesize,
	    (size_t)bufsize);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, bufsize);
	if (lock_status) {
		/* Log an error message(Notice) that locking failed */
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "l2cachehit_wrrdwr, mlock failed!!\n");
	}

	/*
	 * 1. Fill up the buffer by writes, First write in line will be a miss,
	 *    the rest will be hits. As Dcache is not write allocate, all the
	 *    data will be directly written to L2 cache.
	 * 2. Read in increments of Ecache line size, This will be hits to L2,
	 *    as the writes have filled in the buffer.
	 * 3. Write in increments of line_size. This will also be hits.
	 */
	for (i = 0; i < limit; i++) {
		buf[i] = write_val;
	}
	while (loop_cnt--) {
		for (i = 0; i < limit; i += stride) {
			write_val += buf[i];
		}
	}
	loop_cnt = num_loops;
	while (loop_cnt--) {
		for (i = 0; i < limit; i += stride) {
			buf[i] = write_val;
		}
	}

	/* Unlock the memory, only if the mlock above succeeded */
	if (lock_status == 0) {
		lock_status = munlock((void *)buf, bufsize);
		if (lock_status) {
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME,
			    0, "l2cachehit_wrrdwr: munlock failed!!\n");
		}
	}
	free(buf);

	return (0);
}

/*
 * Derived datatype tests. This contains Write & Write-Read-Verify
 */
/*
 * Write the reference element into every slot of the buffer, reading
 * each field back immediately and verifying it.  Alternate passes walk
 * the buffer in opposite directions.  Returns 0 on success, -1 on the
 * first miscompare.
 */
static int
derived_dtype_write_read_verify(derived_dtype_t buf, size_t bufsize,
    int num_loops, derived_dtype_t val)
{
	ulong_t nelem = bufsize / sizeof (derived_dtype_s);
	int forward = 1;	/* 1 = walk up, 0 = walk down */

	while (num_loops--) {
		ulong_t remaining;

		for (remaining = nelem; remaining != 0; remaining--) {
			/* Copy the reference element field by field */
			buf->char_elem = val->char_elem;
			buf->short_elem = val->short_elem;
			buf->int_elem = val->int_elem;
			buf->long_elem = val->long_elem;

			/* Read each field back and compare */
			if (buf->char_elem != val->char_elem ||
			    buf->short_elem != val->short_elem ||
			    buf->int_elem != val->int_elem ||
			    buf->long_elem != val->long_elem) {
				return (-1);
			}

			/* Stay on the last element; next pass starts there */
			if (remaining > 1)
				buf += forward ? 1 : -1;
		}
		forward = !forward;
	}
	return (0);
}

/*
 * Write the reference element into every slot of the buffer, field by
 * field, alternating walk direction on each pass.  Always returns 0.
 */
static int
derived_dtype_write(derived_dtype_t buf, size_t bufsize, int num_loops,
    derived_dtype_t val)
{
	ulong_t nelem = bufsize / sizeof (derived_dtype_s);
	int forward = 1;	/* 1 = walk up, 0 = walk down */

	while (num_loops--) {
		ulong_t remaining;

		for (remaining = nelem; remaining != 0; remaining--) {
			buf->char_elem = val->char_elem;
			buf->short_elem = val->short_elem;
			buf->int_elem = val->int_elem;
			buf->long_elem = val->long_elem;

			/* Stay on the last element; next pass starts there */
			if (remaining > 1)
				buf += forward ? 1 : -1;
		}
		forward = !forward;
	}
	return (0);
}

static int
derived_dtype_test(size_t cache_size, int num_loops)
{
	derived_dtype_t buf;
	derived_dtype_s val;
	int lock_status;
	int rc = 0;

	/* Allocate and lock buffer for test loop */
	buf = (derived_dtype_t)memalign((size_t)pagesize,
	    (size_t)cache_size);
	if (buf == NULL) {
	    return (ENOMEM);
	}
	lock_status = mlock((void *)buf, cache_size);
	if (lock_status) {
		/* Log an error message(Notice) that locking failed */
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "derived_dtype mlock failed!!\n");
	}

	if ((rc = derived_dtype_write(buf, cache_size, num_loops,
	    &val)) != 0) {
		goto derived_test_exit;
	}
	if ((rc = derived_dtype_write_read_verify(buf, cache_size, num_loops,
	    &val)) != 0) {
		goto derived_test_exit;
	}

derived_test_exit:
	if (lock_status) {
		/* unlock the memory */
		lock_status = munlock((void *)buf, cache_size);
		if (lock_status) {
			/* Log an Fatal error? message */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "derived_dtype munlock failed!!\n");
		}
	}
	free(buf);

	return (rc);
}

static int cache_readmiss(ulong_t *buf, size_t bufsize,
    size_t row_size, int num_loops);

/*
 * The main cache hit stress function.
 *
 * The line size and cache size are passed to this.
 */
static int
cache_hit_stress(size_t cache_size, size_t line_size, int num_loops)
{
	int rval = 0;
	ulong_t *buf;
	int lock_status;
	size_t bufsize;		/* was int: keep the size_t width */

	/*
	 * Do test loop(s) for combined data types. These loops may give
	 * cache access rates lower because of the some compare loops.
	 */
	if ((rval = basic_dtype_l2cachehit_stress(cache_size, num_loops))
	    != 0) {
		if (rval != ENOMEM)
			return (rval);
	}

	/*
	 * Do the test loop for write-read-Write hits
	 */
	if ((rval = l2cachehit_wrrdwr(cache_size, line_size, num_loops))
	    != 0) {
		if (rval != ENOMEM)
			return (rval);
	}

	/* Do the derived data type hit loop */
	if ((rval = derived_dtype_test(cache_size, num_loops)) != 0) {
		if (rval != ENOMEM)
			return (rval);
	}

	/*
	 * Read miss loop with increment of linesize / 2, this shall
	 * cause 50% hit loop.
	 */
	bufsize = cache_size;
	buf = (ulong_t *)memalign((size_t)pagesize,
	    (size_t)bufsize);
	if (buf == NULL) {
		/* NOTE(review): alloc failure here is treated as a skip */
		return (0);
	}
	lock_status = mlock((void *)buf, bufsize);
	if (lock_status) {
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "cache_miss mlock failed!!\n");
	}

	rval = cache_readmiss(buf, bufsize, line_size / 2, num_loops);

	/*
	 * Unlock (only if the mlock above succeeded -- the old test
	 * was inverted) and free the memory.
	 */
	if (lock_status == 0) {
		lock_status = munlock((void *)buf, bufsize);
		if (lock_status) {
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cache_miss munlock failed!!\n");
		}
	}
	free(buf);

	if (rval != ENOMEM)
		return (rval);
	else
		return (0);
}

/*
 * Cache write miss loops.
 *
 * These loops take row_size as a parameter. This can be either line size
 * or multiple of it. Accessing a buffer in increments of line size causes
 * a miss. Accessing buffer with a different increment causes miss in a
 * different manner.
 */
/*
 * Write-miss loop: store write_val into the buffer one column at a
 * time, striding a full row between consecutive stores so each access
 * lands in a different cache line.  The pattern is doubled after every
 * full pass.  Always returns 0.
 */
static int
cache_writemiss(ulong_t *buf, size_t bufsize, size_t row_size, int num_loops,
    ulong_t write_val)
{
	size_t words_per_row = row_size / sizeof (ulong_t);
	size_t rows = bufsize / row_size;

	while (num_loops--) {
		size_t col;

		for (col = 0; col < words_per_row; col++) {
			ulong_t *slot = buf + col;
			size_t row;

			for (row = 0; row < rows; row++) {
				*slot = write_val;
				slot += words_per_row;
			}
		}
		write_val *= 2;	/* fresh pattern for the next pass */
	}

	return (0);
}

/*
 * Read-miss loop: read the buffer one column at a time, striding a
 * full row between consecutive loads.  The running sum is kept in a
 * volatile and stored back to buf[0] so the reads cannot be optimized
 * away.  Always returns 0.
 */
static int
cache_readmiss(ulong_t *buf, size_t bufsize, size_t row_size, int num_loops)
{
	size_t words_per_row = row_size / sizeof (ulong_t);
	size_t rows = bufsize / row_size;
	volatile ulong_t sum = (ulong_t)lrand48();

	while (num_loops--) {
		size_t col;

		for (col = 0; col < words_per_row; col++) {
			volatile ulong_t *slot = buf + col;
			size_t row;

			for (row = 0; row < rows; row++) {
				sum += *slot;
				slot += words_per_row;
			}
		}
	}
	/* Publish the sum so the compiler must perform the loads */
	*buf = sum;

	return (0);
}

/*
 * Read-miss-then-write loop: read-modify-write (multiply by write_val)
 * every word, column by column, striding a full row between
 * consecutive accesses.  Always returns 0.
 */
static int
cache_readmiss_write(ulong_t *buf, size_t bufsize, size_t row_size,
    int num_loops, ulong_t write_val)
{
	size_t words_per_row = row_size / sizeof (ulong_t);
	size_t rows = bufsize / row_size;

	while (num_loops--) {
		size_t col;

		for (col = 0; col < words_per_row; col++) {
			volatile ulong_t *slot = buf + col;
			size_t row;

			for (row = 0; row < rows; row++) {
				*slot *= write_val;
				slot += words_per_row;
			}
		}
	}

	return (0);
}

/*
 * Write-miss-then-read loop: store an ever-doubling value into each
 * word and immediately read it back, column by column with a full-row
 * stride.  The volatile sum is written to buf[0] at the end so the
 * read-backs cannot be optimized away.  Always returns 0.
 */
static int
cache_writemiss_read(ulong_t *buf, size_t bufsize, size_t row_size,
    int num_loops, ulong_t write_val)
{
	size_t words_per_row = row_size / sizeof (ulong_t);
	size_t rows = bufsize / row_size;
	volatile ulong_t sum = (ulong_t)lrand48();	/* init for lint */

	while (num_loops--) {
		size_t col;

		for (col = 0; col < words_per_row; col++) {
			ulong_t *slot = buf + col;
			size_t row;

			for (row = 0; row < rows; row++) {
				*slot = write_val;
				write_val *= 2;	/* new value per store */
				sum += *slot;
				slot += words_per_row;
			}
		}
	}
	*buf = sum;	/* publish so the reads are not optimized away */

	return (0);
}

/*
 * Trash every line of one cache set (num_loops times) before moving to
 * the next set.  The operation performed on each line is selected by
 * 'operation' (WR_MISS, RD_MISS, WR_MISS_RD, RD_MISS_WR).
 * Always returns 0.
 */
static int
cache_sets_trash(ulong_t *buf, size_t cache_size, size_t line_size,
    int set_associativity, int num_loops, ulong_t write_val,
    int operation)
{
	size_t num_of_sets;
	size_t jump_by_line;
	size_t jump_by_set_next_line;
	ulong_t *ptr, *buf_line0;
	volatile ulong_t sum = 0;	/* sink so reads are not elided */
	size_t i;			/* was int: matched to size_t limit */
	int j, count;

	buf_line0 = buf;
	num_of_sets = (cache_size / (line_size * set_associativity));
	jump_by_line = line_size / (size_t)sizeof (ulong_t);
	/* Word offset to the next line that maps into the same set */
	jump_by_set_next_line = (line_size * num_of_sets) /
	    (size_t)sizeof (ulong_t);

	/*
	 * The inner loops (while and for) will trash all lines
	 * in the same set num_loops number of times.
	 */
	for (i = 0; i < num_of_sets; i++) {
		count = num_loops;
		while (count--) {
			ptr = buf_line0;
			for (j = 0; j < set_associativity; j++) {
				switch (operation) {
				case WR_MISS:
					*ptr = write_val;
					ptr += jump_by_set_next_line;
					/* Change write value */
					write_val *= 2;
					break;

				case RD_MISS:
					sum += *ptr;
					ptr += jump_by_set_next_line;
					break;

				case WR_MISS_RD:
					*ptr = write_val;
					/* Change write value */
					write_val *= 2;
					sum = *ptr;
					ptr += jump_by_set_next_line;
					break;

				case RD_MISS_WR:
					(*ptr) *= write_val;
					ptr += jump_by_set_next_line;
					break;

				default:
					/* Unknown operation: no access */
					break;
				}
			}
		}
		buf_line0 += jump_by_line;
	}
	return (0);
}


/*
 * The main cache miss stress  function. This contains the following test loops
 * 1. Matrix with row size equal to line size
 *
 * The test loop is as below :
 *	ulong_t *ptr0 = buf of size  bufsize = 2 * cache_size
 *	num_clmn = row_size / sizeof(ulong)
 *	num_row = bufsize / rowsize
 *	for (j = 0; j < num_clmn; j++)
 *		ptr0 = buf + j;
 *		for (i = 0; i < num_row; i++)
 *			<OPERATE> *buf
			buf += num_clmn
 *
 * The inner loop jumps in terms of rows. This will cause an element of the
 * matrix with offset = row_size(either line size or cache_size), thus
 * causing a miss.
 *
 * The buffer size is set to 2 times cache size because :
 *	1. A end of inner loop the first part of the buffer(size = cachesize)
 *	   is out of the cache. The access to the start of the buffer will be
 *	   a miss.
 *	2. The same loop can be repeated N times causing same miss pattern. If
 *	   the buffer size is equal to cache size, the second iteration of the
 *	   loop will be a hit, as nothing else has replaced it(Assuming
 *	   standalone operation of the test loop.
 *	3. Assuming that the number of loops is a multiple of two, the same
 *	   buffer can be used to test different operations(write, read, etc.).
 *
 * Operations supported : Write miss, Read miss, Read miss followed by write &
 *	Write miss followed by read.
 */
static int
cache_miss_stress(size_t cache_size, size_t line_size,
	int set_associativity, int num_loops)
{
	ulong_t *buf;
	size_t bufsize = cache_size * 2;	/* see block comment above */
	int lock_status;
	ulong_t write_value;
	int rc = 0;
#ifdef L2CACHE_PERF_MONITOR
	cpc_event_t before, after, diff;
#endif	/* L2CACHE_PERF_MONITOR */

	/* Alloc and lock the memory */
	buf = (ulong_t *)memalign((size_t)pagesize,
	    (size_t)bufsize);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, bufsize);
	if (lock_status) {
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "cache_miss mlock failed!!\n");
	}

	write_value = (ulong_t)lrand48();

	/*
	 * Line size miss
	 */
	L2CACHE_CPC_SETUP(l2wr_perf_string);
	if ((rc = cache_writemiss(buf, bufsize, line_size, num_loops,
	    write_value)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("Write miss loop : ");

	L2CACHE_CPC_SETUP(l2rd_perf_string);
	if ((rc = cache_readmiss(buf, bufsize, line_size, num_loops))
	    != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("Read miss loop : ");

	L2CACHE_CPC_SETUP(l2rd_perf_string);
	if ((rc = cache_readmiss_write(buf, bufsize, line_size, num_loops,
	    write_value)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("Read miss write loop : ");

	L2CACHE_CPC_SETUP(l2wr_perf_string);
	if ((rc = cache_writemiss_read(buf, bufsize, line_size, num_loops,
	    write_value)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("Write miss read loop : ");


	/*
	 * Trash all the sets in a cache.
	 * If set associativity is greater than 1, then trash all the
	 * sets in the cache.  Otherwise it is a direct-mapped cache in
	 * which each line is equivalent to one set, covered by the
	 * loops above.
	 *
	 * Fix: fall through to the common exit path here; returning
	 * directly leaked 'buf' and left it mlock()ed.
	 */
	if (set_associativity <= 1) {
		goto cache_miss_exit;
	}

	L2CACHE_CPC_SETUP(l2wr_perf_string);
	if ((rc = cache_sets_trash(buf, bufsize, line_size,
	    set_associativity, num_loops, write_value,
	    WR_MISS)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("sets write miss loop : ");

	L2CACHE_CPC_SETUP(l2rd_perf_string);
	if ((rc = cache_sets_trash(buf, bufsize, line_size,
	    set_associativity, num_loops, write_value,
	    RD_MISS)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("sets read miss loop : ");

	L2CACHE_CPC_SETUP(l2wr_perf_string);
	if ((rc = cache_sets_trash(buf, bufsize, line_size,
	    set_associativity, num_loops, write_value,
	    WR_MISS_RD)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("sets write miss read loop : ");

	L2CACHE_CPC_SETUP(l2rd_perf_string);
	if ((rc = cache_sets_trash(buf, bufsize, line_size,
	    set_associativity, num_loops, write_value,
	    RD_MISS_WR)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("sets read miss write loop : ");

cache_miss_exit:
	/*
	 * Unlock (only if the mlock above succeeded -- the old test
	 * was inverted) and free the memory.
	 */
	if (lock_status == 0) {
		lock_status = munlock((void *)buf, bufsize);
		if (lock_status) {
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cache_miss munlock failed!!\n");
		}
	}
	free(buf);

	return (rc);
}

/*
 * Random offset access test loops:
 *		These test loops exercise the cache in a random manner,
 *		causing cache hits and misses in an random manner.
 *
 *		Currently writeonly, readonly and wr-read-verify loops
 *		are supported. Write followed by Read verify cannot be
 *		tested as write at random locations does NOT guarantee
 *		that the complete buffer is written. The read-verify
 *		at a location which was not written WILL fail!!!
 *		Read verify can be done with fully written buffer.
 */
/*
 * Random-offset write loop: write the pattern at randomly chosen word
 * offsets within the buffer, doubling the pattern after each pass.
 * Returns 0 on success or ENOMEM if the offset table cannot be built.
 */
static int
rand_off_wr_loop(ulong_t *buf, size_t bufsize, size_t numloops,
	ulong_t write_val)
{
	ulong_t *offsets;
	size_t	noffsets;
	size_t	i;

	/* Obtain a table of random offsets, each within [0, bufsize - 1] */
	offsets = l2_get_randbuf(bufsize, &noffsets);
	if (offsets == NULL) {
		return (ENOMEM);
	}

	while (numloops--) {
		for (i = 0; i < noffsets; i++)
			buf[offsets[i]] = write_val;
		write_val *= 2;	/* new pattern for the next pass */
	}
	free(offsets);
	return (0);
}

/*
 * Random-offset write-read-verify loop: write the pattern at randomly
 * chosen word offsets and immediately read each back for comparison.
 * Returns 0 on success, ENOMEM if the offset table cannot be built,
 * or -1 on a miscompare.
 */
static int
rand_off_wrrdvrfy_loop(ulong_t *buf, size_t bufsize, int numloops,
	ulong_t write_val)
{
	ulong_t *rand_buf;
	size_t	rand_bufsz, i;

	/*
	 * Fill the random buffer with random offsets. The offsets
	 * are limited by ]0 - (bufsize -1)].
	 */
	rand_buf = l2_get_randbuf(bufsize, &rand_bufsz);
	if (rand_buf == NULL) {
		return (ENOMEM);
	}

	while (numloops--) {
		for (i = 0; i < rand_bufsz; i++) {
			buf[rand_buf[i]] = write_val;
			if (buf[rand_buf[i]] != write_val) {
				/* Fix: rand_buf was leaked on this path */
				free(rand_buf);
				return (-1);
			}
		}
		write_val *= 2;
	}
	free(rand_buf);
	return (0);
}

/*
 * Random-offset read loop: read the buffer at randomly chosen word
 * offsets, accumulating into a volatile sink so the loads survive
 * optimization.  Returns 0 on success or ENOMEM if the offset table
 * cannot be built.
 */
static int
rand_off_rd_loop(ulong_t *buf, size_t bufsize, int numloops)
{
	ulong_t *offsets;
	size_t	noffsets;
	size_t	i;
	volatile ulong_t read_sum;

	/* Obtain a table of random offsets, each within [0, bufsize - 1] */
	offsets = l2_get_randbuf(bufsize, &noffsets);
	if (offsets == NULL) {
		return (ENOMEM);
	}
	read_sum = noffsets;

	while (numloops--) {
		for (i = 0; i < noffsets; i++)
			read_sum += buf[offsets[i]];
	}
	free(offsets);
	return (0);
}

/*
 * Driver for the random-offset loops: runs the write, the
 * write-read-verify and the read loop in turn over the same buffer.
 * Returns 0 on success or the first non-zero loop result.
 */
static int
cache_rand_off_stress(ulong_t *buf, size_t bufsize, int numloops)
{
	int rval;
	ulong_t write_val = (ulong_t)lrand48();

	/* Work in words from here on */
	bufsize /= sizeof (ulong_t);

	rval = rand_off_wr_loop(buf, bufsize, numloops, write_val);
	if (rval != 0)
		return (rval);

	write_val = (ulong_t)lrand48();
	rval = rand_off_wrrdvrfy_loop(buf, bufsize, numloops, write_val);
	if (rval != 0)
		return (rval);

	return (rand_off_rd_loop(buf, bufsize, numloops));
}

/*
 * March test to flush out the stuck at and coupling failures
 * in the Ecache.
 */
static	int
cache_march_stress1(size_t cache_size, size_t line_size, int num_loops)
{
	ulong_t	*buf;
	int bufsize	= cache_size;
	int	ret = 0;
	int	lock_status = 0;

	ulong_t		pattern, cpattern;
	ulong_t 	*ptr0;
	ulong_t 	num_row, num_clmn; /* Num. of rows and columns */
	ulong_t 	i, j, sum, patnum;   /* Loop variables */
	int		arr_size;

	ulong_t	pats[] = {
		0xFFFFFFFFFFFFFFFFL, 0xAAAAAAAAAAAAAAAAL,
		0xfefefefefefefefeL, 0xfdfdfdfdfdfdfdfdL,
		0xfbfbfbfbfbfbfbfbL, 0xf7f7f7f7f7f7f7f7L,
		0xefefefefefefefefL, 0xdfdfdfdfdfdfdfdfL,
		0x7f7f7f7f7f7f7f7fL, 0xf0f0f0f0f0f0f0f0L,
		0x3333333333333333L, 0xccccccccccccccccL
	};

	arr_size = sizeof (pats) / sizeof (ulong_t);
	vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
	    "L2 March test with row striding\n");

	vts_message(0, VERBOSE, NULL, MARCH_TEST_ROW_STRIDE_START);

	/* Alloc and lock the memory */
	buf = (ulong_t *)memalign((size_t)pagesize, (size_t)bufsize);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, bufsize);
	num_clmn = line_size / sizeof (ulong_t);
	num_row = bufsize / line_size;

	while (num_loops--) {
		patnum = l2_getrandom(arr_size + 1);
		if (patnum < arr_size) {
			pattern = pats[patnum];
			cpattern = ~pattern;
		} else {
			pattern = (ulong_t)lrand48();
			cpattern = ~pattern;
		}

		/* Doing March test with word size patterns */
		/* Step 1: Write to the entire E$ one word at a time */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				*ptr0 = pattern;
				ptr0 += num_clmn;
			}
		}


		/* Step 2: Read the entire E$ one word at a time */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				sum = *ptr0;
				ptr0 += num_clmn;
			}
		}

		/* Step 3: Write pattern compliment to the entire E$ */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				*ptr0 = cpattern;
				ptr0 += num_clmn;
			}
		}

		/* Step 4: Read the entire E$ */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				sum = *ptr0;
				ptr0 += num_clmn;
			}
		}

		/* Step 5: Write the original pattern to the entire E$ */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				*ptr0 = pattern;
				ptr0 += num_clmn;
			}
		}

		/* Step 6: Read the entire E$ */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				sum = *ptr0;
				if (sum != pattern) {
					ret = -1;
					goto cache_march1_exit;
				}
				ptr0 += num_clmn;
			}
		}
	}

cache_march1_exit:
	/* unlock(if lock was successful) and free the memory */
	if (lock_status) {
		lock_status = munlock((void *)buf, bufsize);
		if (lock_status) {
			/* Log an Fatal error? message */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cache_miss munlock failed!!\n");
		}
	}
	vts_message(0, VERBOSE, NULL, MARCH_TEST_ROW_STRIDE_END);
	free(buf);
	return (ret);
}

/*
 * March test to flush out the stuck at and coupling failures
 * in the Ecache.
 */
static  int
cache_march_stress2(size_t cache_size, int num_loops)
{
	ulong_t *buf;
	size_t bufsize = cache_size;
	int ret = 0;
	int lock_status = 0;
	int	arr_size;

	ulong_t pattern, cpattern;
	ulong_t *ptr0;
	size_t num_words = cache_size / (size_t)sizeof (ulong_t);
	size_t j;
	volatile ulong_t sum, patnum; /* Loop variables */

	ulong_t pats[] = {
		0xFFFFFFFFFFFFFFFFL, 0xAAAAAAAAAAAAAAAAL,
		0xfefefefefefefefeL, 0xfdfdfdfdfdfdfdfdL,
		0xfbfbfbfbfbfbfbfbL, 0xf7f7f7f7f7f7f7f7L,
		0xefefefefefefefefL, 0xdfdfdfdfdfdfdfdfL,
		0x7f7f7f7f7f7f7f7fL, 0xf0f0f0f0f0f0f0f0L,
		0x3333333333333333L, 0xccccccccccccccccL
	};

	arr_size = sizeof (pats) / sizeof (ulong_t);
	vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
	    "L2 March test with column striding");

	vts_message(0, VERBOSE, NULL, MARCH_TEST_COLUMN_STRIDE_START);
	/* Alloc and lock the memory */
	buf = (ulong_t *)memalign((size_t)pagesize, (size_t)bufsize);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, bufsize);

	while (num_loops--) {
		patnum = l2_getrandom(arr_size + 1);
		if (patnum < arr_size) {
			pattern = pats[patnum];
			cpattern = ~pattern;
		} else {
			pattern = (ulong_t)lrand48();
			cpattern = ~pattern;
		}

		/* Doing March test with word size patterns */

		/* Step 1: Write to the entire E$ one word at a time */
		for (j = 0; j < num_words; j++) {
			ptr0 = buf;
			*ptr0 = pattern;
			ptr0++;
		}

		/* Step 2: Read the entire E$ one word at a time */
		for (j = 0; j < num_words; j++) {
			ptr0 = buf;
			sum = *ptr0;
			ptr0++;
		}

		/* Step 3: Write pattern compliment to the entire E$ */
		for (j = 0; j < num_words; j++) {
			ptr0 = buf;
			*ptr0 = cpattern;
			ptr0++;
		}

		/* Step 4: Read the entire E$ */
		for (j = 0; j < num_words; j++) {
			ptr0 = buf;
			sum = *ptr0;
			ptr0++;
		}

		/* Step 5: Write the original pattern to the entire E$ */
		for (j = 0; j < num_words; j++) {
			ptr0 = buf;
			*ptr0 = pattern;
			ptr0++;
		}

		/* Step 6: Read the entire E$ and compare */
		for (j = 0; j < num_words; j++) {
			ptr0 = buf;
			sum = *ptr0;
			if (sum != pattern) {
				ret = -1;
				goto cache_march2_exit;
			}
			ptr0++;
		}
	}

cache_march2_exit:
	/* unlock(if lock was successful) and free the memory */
	if (lock_status) {
		lock_status = munlock((void *)buf, bufsize);
		if (lock_status) {
			/* Log an Fatal error? message */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cache_miss munlock failed!!\n");
		}
	}
	vts_message(0, VERBOSE, NULL, MARCH_TEST_COLUMN_STRIDE_END);
	free(buf);
	return (ret);
}

/*
 * The Main cache stress function callable from the test main.
 *
 * Runs, in order: hit loops, miss loops, march tests 1-3, the
 * random swap test and the random offset loops, using the cache
 * geometry from cache_info.
 *
 * Arguments : num_loops - iteration count for each test loop
 *	cache_info - L2 cache size, line size and associativity
 * Returns : 0 on success, non-zero (-1 or an errno value) on failure
 */
int
l2cache_stress(int num_loops, cache_info_t *cache_info)
{
	int rval = 0;
	size_t cache_size, line_size;
	ulong_t	*buf;
	int lock_status;
	int set_associativity;

	pagesize = (size_t)sysconf(_SC_PAGESIZE);

	cache_size = cache_info->ecache_size;
	line_size = cache_info->ecache_line_size;
	set_associativity = cache_info->ecache_associativity;

#ifdef L2CACHE_PERF_MONITOR
	if (l2_cpc_valid(&l2cache_cpc_ver, L2_CACHE_TEST_NAME,
	    L2_LIBCPC_MSG) == 0) {
		l2cache_cpc_valid = 1;
	}

	/*
	 * Initialize the performance monitoring strings.
	 * US-III, US-III+ and US-IIIi all have the same
	 * performance counters.
	 *
	 * Use cpc_getcciname to get the printable description
	 * of processor performance counter interfaces.
	 */
	if (l2cache_cpc_valid) {
		const char *cciname;

		cciname = cpc_getcciname(l2cache_cpc_ver);

		if (cciname && ((strncmp(cciname, "UltraSPARC III",
		    strlen("UltraSPARC III")) == 0) ||
		    (strncmp(cciname, "UltraSPARC III+ & IV",
		    strlen("UltraSPARC III+ & IV")) == 0))) {
			/* for US-III/US-III+/US-IV cpus */
			l2wr_perf_string = "pic0=EC_ref,pic1=EC_misses";
			l2rd_perf_string = "pic0=EC_ref,pic1=EC_misses";
			l2_pic0_name = "EC_ref";
			l2_pic1_name = "EC_misses";
		} else if (cciname) { /* UltraSParc I&II */
			l2wr_perf_string = "pic0=EC_ref,pic1=EC_hit";
			l2rd_perf_string = "pic0=EC_ref,pic1=EC_hit";
			l2_pic0_name = "EC_ref";
			l2_pic1_name = "EC_hit";
		} else {
			/* This should NOT happen */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cpc_getcciname failed!!\n");
			l2cache_cpc_valid = 0;
		}
	}
#endif

	vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
	    "Cache Access stress for cache_size %x line_size %x"
	    " num_loops %d\n", cache_size, line_size, num_loops);

	/* Cache hit stress */
	if ((rval = cache_hit_stress(cache_size, line_size, num_loops))
	    != 0) {
		return (rval);
	}

	/* Cache miss stress */
	if ((rval = cache_miss_stress(cache_size, line_size,
	    set_associativity, num_loops)) != 0) {
		return (rval);
	}

	/* March testing to flush out stuck-at and coupling faults */
	if ((rval = cache_march_stress1(cache_size, line_size, num_loops))
	    != 0) {
		return (rval);
	}

	/* March testing to flush out stuck-at and coupling faults */
	if ((rval = cache_march_stress2(cache_size, num_loops))
	    != 0) {
		return (rval);
	}

	/*
	 * 17N March test writing to one complete cache line at a
	 * time.  Uses ldx/stx for writing and reading.  There is no
	 * checking in the subtest; it depends on ECC checking to
	 * catch the failures.
	 */
	if ((rval = cache_march_stress3(cache_size, line_size, num_loops))
	    != 0) {
		return (rval);
	}

	/* Random swap testing */
	if ((rval = crand(cache_size)) != 0) {
		return (rval);
	}

	/*
	 * Random offset testing
	 */

	/* Alloc and lock the memory */
	buf = (ulong_t *)memalign((size_t)pagesize,
	    (size_t)cache_size);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, cache_size);
	if (lock_status) {
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "cache_miss mlock failed!!\n");
	}

	/*
	 * Run the loop, then ALWAYS unlock and free the buffer.
	 * (The original returned straight out on failure here and
	 * leaked the locked buffer.)
	 */
	rval = cache_rand_off_stress(buf, cache_size, num_loops);

	/* Unlock only when the mlock above succeeded (returned 0) */
	if (lock_status == 0) {
		if (munlock((void *)buf, cache_size) != 0) {
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cache_miss munlock failed!!\n");
		}
	}
	free(buf);

	return (rval);
}
/*
 * Copyright 2002 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%W% %E% SMI"

/*
 * This file contains test loops for stressing the Level 2 data cache.
 * It currently contains the following :
 * 1. Hit Loops
 *	1.1 Read hits
 *		Write (first write to line miss, followed by hits in line)
 *		Read in increments of line size(all hits to L2)
 *		Write in increments of line size(all hits to L2)
 *	1.2 Write hits
 *		Write the complete buffer of size equal to cache size. Repeat
 *		This is done for ulong and double data types.
 *	1.3 50% read hits
 *		Read in increments of line size / 2. First read will be a miss
 *		followed by a hit.
 *	1.4 Derived data type hits
 *		Write and Write-Read-Verify loops with a derived datatype
 *		This accesses the L2 cache with different alignments.
 *
 * 2. Miss Loops
 *	2.1. Write miss loop
 *	2.2. Read miss loop
 *	2.3. Write miss read loop
 *	2.4. Read miss write loop
 *
 *
 * 3. Set Miss Loops  : Access all lines for the set before accessing next
 *	set. Operations tested are :
 *	  3.1. Write set miss loop
 *	  3.2. Read set miss loop
 *	  3.3. Write miss Read loop
 *	  3.4. Read miss Write loop
 *
 *
 * 4. Random offset loops : Test loops which access a buffer at random
 *    locations. The operations for random access loop are :
 *	 4.1 Write
 *	 4.2 Write-Read-Verify
 *	 4.3 Read
 *
 * TBD :
 *	basic_dtype_l2cachehit_stress : datatypes char, short, int
 *
 */

#include	<sys/types.h>
#include	<sys/errno.h>
#include	<sys/time.h>
#include	<signal.h>
#include	<stdlib.h>
#include	<sys/mman.h>
#include	<unistd.h>
#include	<stdio.h>
#include	<testinfo.h>
#include	<note.h>
#include	"l2_util.h"
#include	"l2cache.h"

#define	L2_CACHE_TEST_NAME	"l2cache"

/*
 * Definition of the derived (mixed-member) data type used by the
 * derived data type test loops.  Packed to 1-byte alignment so the
 * members land on odd addresses and the loops access the L2 cache
 * with varying alignments.
 */
#pragma pack(1)
typedef struct derived_dtype {
	char	char_elem;	/* 1-byte member at offset 0 */
	short	short_elem;	/* 2-byte member, misaligned by pack(1) */
	int	int_elem;	/* 4-byte member */
	ulong_t	long_elem;	/* word-size member */
} derived_dtype_s;
#pragma pack()
typedef derived_dtype_s *derived_dtype_t;	/* pointer convenience type */

/* Pagesize for this system */
static size_t pagesize;

static int basic_dtype_l2cachehit_stress(size_t bufsize,
    int number_loops);
static int l2cachehit_wrrdwr(size_t bufsize, size_t line_size,
    int num_loops);
static int derived_dtype_write_read_verify(derived_dtype_t buf,
    size_t bufsize, int num_loops, derived_dtype_t val);
static int derived_dtype_write(derived_dtype_t buf,
    size_t bufsize, int num_loops, derived_dtype_t val);
static int cache_writemiss(ulong_t *buf, size_t bufsize,
    size_t row_size, int num_loops, ulong_t write_val);
static int cache_readmiss(ulong_t *buf, size_t bufsize,
    size_t row_size, int num_loops);
static int cache_readmiss_write(ulong_t *buf, size_t bufsize,
    size_t row_size, int num_loops, ulong_t write_val);
static int cache_writemiss_read(ulong_t *buf, size_t bufsize,
    size_t row_size, int num_loops, ulong_t write_val);
static int cache_sets_trash(ulong_t *buf, size_t cache_size,
    size_t line_size, int set_associativity, int num_loops,
    ulong_t write_val, int operation);
static int rand_off_wr_loop(ulong_t *buf, size_t bufsize,
    size_t numloops, ulong_t write_val);
static int rand_off_wrrdvrfy_loop(ulong_t *buf, size_t bufsize,
    int numloops, ulong_t write_val);
static int rand_off_rd_loop(ulong_t *buf, size_t bufsize,
    int numloops);
static	int cache_march_stress1(size_t cache_size,
    size_t line_size, int num_loops);
static  int cache_march_stress2(size_t cache_size, int num_loops);

/*
 * Optimization for all test loops to be minimal.
 */
#pragma	opt 1	(basic_dtype_l2cachehit_stress, l2cachehit_wrrdwr)
#pragma	opt 1	(derived_dtype_write_read_verify, derived_dtype_write)
#pragma	opt 1	(cache_writemiss, cache_readmiss, cache_writemiss_read)
#pragma	opt 1	(cache_sets_trash, rand_off_wr_loop)
#pragma	opt 1	(rand_off_wrrdvrfy_loop, rand_off_rd_loop)
#pragma	opt 1	(cache_march_stress1, cache_march_stress2)

/* flags to differentiate the cache sets trash loops */
#define		LOOP_BASE	0
#define		WR_MISS		((LOOP_BASE)+1)
#define		RD_MISS		((LOOP_BASE)+2)
#define		WR_MISS_RD	((LOOP_BASE)+3)
#define		RD_MISS_WR	((LOOP_BASE)+4)

/* Cache random swapping function */
extern  int crand(ulong_t);
extern  int cache_march_stress3(size_t, size_t, int);

#ifdef L2CACHE_PERF_MONITOR
/*
 * Processor event strings for L2 (E$) read and write statistics.
 * These strings are supported on UltraSPARC I, II and III.
 * TBD : Initialize these strings dynamically, based on the
 * processor type.
 */
static char *l2wr_perf_string;
static char *l2rd_perf_string;

/* Event name strings, used for display only */
static char *l2_pic0_name;
static char *l2_pic1_name;

/*
 * l2cache_cpc_valid - Set if CPC is valid for this processor
 * l2cache_perf_valid- Set if cpc_setup succeeded and the "before"
 *		sample was obtained correctly
 * l2cache_cpc_ver - CPU CPC version
 */
static int l2cache_cpc_valid = 0;
static int l2cache_perf_valid;
static int l2cache_cpc_ver;

/*
 * L2CACHE_CPC_SETUP(perf_string)
 * Program the performance counters with perf_string and take the
 * "before" sample.  Expects a cpc_event_t named `before` in the
 * enclosing scope.  On any failure l2cache_perf_valid is cleared,
 * which turns the matching L2CACHE_CPC_AFTER() into a no-op.
 */
#define	L2CACHE_CPC_SETUP(perf_string)	\
	if (l2cache_cpc_valid) {	\
		l2cache_perf_valid =	\
		    (l2cache_cpc_setup((perf_string),	\
		    l2cache_cpc_ver) != -1);	\
		if (l2cache_perf_valid &&	\
		    cpc_take_sample(&before) == -1) {	\
			l2cache_perf_valid = 0;	\
			vts_message(	\
			    NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,	\
			    "In cpc_take_sample(before)\n");	\
		}	\
	}

/*
 * L2CACHE_CPC_AFTER(msg_prefix)
 * Take the "after" sample, diff it against "before" and log the two
 * event counts at VERBOSE level.  Expects cpc_event_t variables
 * `before`, `after` and `diff` in the enclosing scope.
 * NOTE(review): ce_pic[1] and ce_hrt are recomputed by hand right
 * after cpc_event_diff(); presumably this works around a limitation
 * of cpc_event_diff() on some CPC versions -- confirm before
 * simplifying.
 */
#define	L2CACHE_CPC_AFTER(msg_prefix)	\
	if (l2cache_cpc_valid && l2cache_perf_valid) {	\
		if (cpc_take_sample(&after) == -1) {	\
			vts_message(	\
			    NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,	\
			    "In cpc_take_sample(after)\n");	\
		} else {	\
			cpc_event_diff(&diff, &after, &before);	\
			diff.ce_pic[1] = after.ce_pic[1] -	\
				before.ce_pic[1];	\
			diff.ce_hrt = after.ce_hrt - before.ce_hrt;	\
			vts_message(	\
			    NO_EXIT, VERBOSE, L2_CACHE_TEST_NAME, 0,	\
			    "L2 cache %s%s %llu, %s %llu\n",	\
			    (msg_prefix), l2_pic0_name,	\
			    diff.ce_pic[0], l2_pic1_name,	\
			    diff.ce_pic[1]);	\
		}	\
	}
#else
/* Performance monitoring compiled out: both macros become no-ops */
#define	L2CACHE_CPC_SETUP(perf_string)
#define	L2CACHE_CPC_AFTER(msg_prefix)
#endif	/* L2CACHE_PERF_MONITOR */


/*
 * L2 cache hit loops
 * --------------------
 * L2 Cache Write/Write-Read-Verify cache hit test loops.
 * These test loops are common for all basic data types.  These test
 * loops continue for the number of test loops specified as an
 * argument.
 *
 * The general format of test loop are as below :
 *	for num_loops do
 *		Continue till buffer limit is reached
 *			<Test-operation>
 *
 * These defines SHOULD be used as inline code. The following
 * SHOULD be passed as parameters :
 *	1. datatype	- Any of the basic datatype
 *	2. datatype	*buf
 *	4. int		ret_val : Return Value
 *	5. datatype	val : Value to write / Read verify
 *	6. int		num_loops
 *
 * ASSUMPTION : Only called once in a function for a datatype.
 *              Multiple calls WILL result in compile errors
 *
 * Test Loops
 *		Write
 *		Write-Read-Verify
 */

/*
 * NOTE(review): `buf` is always passed a pointer variable (see
 * basic_dtype_l2cachehit_stress), so sizeof (buf) below is the size
 * of the POINTER, not of the underlying buffer.  `limit` therefore
 * comes out as sizeof (pointer) / sizeof (element) -- typically 1 --
 * and each pass touches only that many elements, not the whole
 * cache-size buffer.  Looks unintended; confirm, and consider
 * taking the buffer size as an explicit macro parameter.
 * NOTE(review): `##datatype` with no left-hand token is not valid
 * ISO C token pasting; this presumably relies on the Sun compiler's
 * preprocessor behavior -- confirm before building elsewhere.
 */
#define	l2cachehit_write(datatype, buf, ret_val,	\
    val, num_loops) \
	{ \
		int limit = sizeof (buf) / sizeof (##datatype);	\
		int direction = 1;	\
##datatype *localbuf = (buf);	\
		int cnt = num_loops;	\
		while (cnt--) { \
			int index = limit;	\
			while (index--) {	\
				*localbuf = (val);	\
				localbuf += direction;	\
			}	\
			direction *= -1;	\
		}	\
		(ret_val) = 0;	\
	}

/*
 * Write-read-verify variant: same pointer-sizeof caveat as above.
 * On a read-back mismatch ret_val is set to -1 and control jumps to
 * the per-datatype label (hence "only call once per datatype per
 * function", see the ASSUMPTION in the block comment above).
 */
#define	l2cachehit_write_read_verify(datatype, buf, \
    ret_val, val, num_loops) \
	{ \
##datatype	*localbuf = (buf);	\
		int direction = 1;	\
		int limit = sizeof (buf) / sizeof (##datatype);	\
		int cnt = num_loops;	\
		while (cnt--) { \
			int index = limit;	\
			while (index--) {	\
					*localbuf = val;	\
					if (*localbuf != val) {	\
						(ret_val) = -1;	\
						goto wr_rd_vrfy_##datatype; \
				}	\
				localbuf += direction;	\
			}	\
			direction *= -1;	\
		}	\
		ret_val = 0; \
wr_rd_vrfy_##datatype:	\
		;	\
	}

/*
 * Perform hit test loops for all REQUIRED basic data types,
 * for the specified buffer size.
 *
 * Currently this supports ulong and double.
 *
 * NOTE: the original implementation used the l2cachehit_write /
 * l2cachehit_write_read_verify macros, which computed the element
 * count as sizeof (buf) / sizeof (datatype) where buf is a POINTER,
 * so only about one element was ever touched.  The loops below
 * derive the element count from bufsize so the whole buffer is
 * exercised, which is what a cache-size hit stress requires.
 *
 * Returns 0 on success, -1 on a read-verify mismatch and ENOMEM
 * when the buffer cannot be allocated.
 */
static int
basic_dtype_l2cachehit_stress(size_t bufsize, int number_loops)
{
	void	*buf;
	ulong_t	*ulong_buf;
	ulong_t	write_ulong = (ulong_t)lrand48();
	double	*double_buf;
	double	write_double = drand48();
	int	rc = 0;
	int	lock_status = 0;
	size_t	ulong_limit, double_limit;
	size_t	i;
	int	cnt;

	/* Allocate and lock a buffer shared by all the loops below */
	buf = memalign(pagesize, bufsize);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, bufsize);
	if (lock_status) {
		/* Not fatal: the loops still run, just unlocked */
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "basic_dtype, ulong mlock failed!!\n");
	}
	ulong_buf = (ulong_t *)buf;
	double_buf = (double *)buf;
	ulong_limit = bufsize / sizeof (ulong_t);
	double_limit = bufsize / sizeof (double);

	/* ulong_t write hit loop; sweep direction alternates per pass */
	for (cnt = 0; cnt < number_loops; cnt++) {
		if (cnt & 1) {
			for (i = ulong_limit; i-- > 0; )
				ulong_buf[i] = write_ulong;
		} else {
			for (i = 0; i < ulong_limit; i++)
				ulong_buf[i] = write_ulong;
		}
	}

	write_ulong *= 2;	/* Change write_ulong */

	/* ulong_t write-read-verify hit loop */
	for (cnt = 0; cnt < number_loops; cnt++) {
		for (i = 0; i < ulong_limit; i++) {
			ulong_buf[i] = write_ulong;
			if (ulong_buf[i] != write_ulong) {
				rc = -1;
				goto basic_dtype_exit;
			}
		}
	}

	/* double write hit loop; sweep direction alternates per pass */
	for (cnt = 0; cnt < number_loops; cnt++) {
		if (cnt & 1) {
			for (i = double_limit; i-- > 0; )
				double_buf[i] = write_double;
		} else {
			for (i = 0; i < double_limit; i++)
				double_buf[i] = write_double;
		}
	}

	write_double *= 2;	/* Change write_double */

	/* double write-read-verify hit loop */
	for (cnt = 0; cnt < number_loops; cnt++) {
		for (i = 0; i < double_limit; i++) {
			double_buf[i] = write_double;
			if (double_buf[i] != write_double) {
				rc = -1;
				goto basic_dtype_exit;
			}
		}
	}

basic_dtype_exit:
	/* Unlock only when the mlock above succeeded (returned 0) */
	if (lock_status == 0) {
		if (munlock((void *)buf, bufsize) != 0) {
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME,
			    0, "basic_dtype munlock failure!!\n");
		}
	}

	free(buf);
	return (rc);
}

/*
 * Write loop (first write per line misses, the rest hit) followed
 * by a read hit loop and a write hit loop, both stepping one cache
 * line at a time.  Only done for ulong_t; other data types would
 * add no coverage here.
 *
 * Returns 0 on success, ENOMEM when the buffer cannot be allocated.
 */
static int
l2cachehit_wrrdwr(size_t bufsize, size_t line_size, int num_loops)
{
	ulong_t	write_val;
	ulong_t	*buf;
	int	lock_status = 0;
	ulong_t	limit;
	ulong_t	stride;
	int	loop_cnt;
	ulong_t	i;

	write_val = (ulong_t)lrand48();
	limit = bufsize / sizeof (ulong_t);
	/*
	 * The buffer is indexed in ulong_t units, so stepping one
	 * cache line means advancing line_size / sizeof (ulong_t)
	 * elements.  (The original stepped by line_size ELEMENTS,
	 * i.e. sizeof (ulong_t) times too far, and so touched only
	 * every Nth line.)
	 */
	stride = line_size / sizeof (ulong_t);
	if (stride == 0)
		stride = 1;
	loop_cnt = num_loops;

	/* Allocate and lock the buffer for the test loop */
	buf = (ulong_t *)memalign((size_t)pagesize,
	    (size_t)bufsize);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, bufsize);
	if (lock_status) {
		/* Not fatal: the loops still run, just unlocked */
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "l2cachehit_wrrdwr, mlock failed!!\n");
	}

	/*
	 * 1. Fill up the buffer by writes. The first write in a line is
	 *    a miss, the rest are hits. As the Dcache is not write
	 *    allocate, the data is written straight to the L2 cache.
	 * 2. Read in increments of the E$ line size. These hit in L2
	 *    because the writes above filled the buffer.
	 * 3. Write in increments of the line size. These are hits too.
	 */
	for (i = 0; i < limit; i++) {
		buf[i] = write_val;
	}
	while (loop_cnt--) {
		for (i = 0; i < limit; i += stride) {
			write_val += buf[i];
		}
	}
	loop_cnt = num_loops;
	while (loop_cnt--) {
		for (i = 0; i < limit; i += stride) {
			buf[i] = write_val;
		}
	}

	/* Unlock only when the mlock above succeeded (returned 0) */
	if (lock_status == 0) {
		if (munlock((void *)buf, bufsize) != 0) {
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME,
			    0, "l2cachehit_wrrdwr: munlock failed!!\n");
		}
	}
	free(buf);

	return (0);
}

/*
 * Derived datatype Write-Read-Verify loop.
 *
 * Copies every member of *val into successive buffer slots and
 * reads each member straight back, comparing against the source.
 * The sweep over the buffer reverses direction after every full
 * pass.  Returns 0 on success, -1 on a compare mismatch.
 */
static int
derived_dtype_write_read_verify(derived_dtype_t buf, size_t bufsize,
    int num_loops, derived_dtype_t val)
{
	ulong_t slots = bufsize / sizeof (derived_dtype_s);
	int forward = 1;
	derived_dtype_t cursor = buf;

	while (num_loops--) {
		ulong_t left = slots;

		while (left--) {
			cursor->char_elem = val->char_elem;
			cursor->short_elem = val->short_elem;
			cursor->int_elem = val->int_elem;
			cursor->long_elem = val->long_elem;

			if (cursor->char_elem != val->char_elem ||
			    cursor->short_elem != val->short_elem ||
			    cursor->int_elem != val->int_elem ||
			    cursor->long_elem != val->long_elem) {
				return (-1);
			}
			/* do not move off the final slot of a pass */
			if (left != 0) {
				cursor += forward ? 1 : -1;
			}
		}
		forward = !forward;
	}
	return (0);
}

/*
 * Derived datatype Write loop: same sweep as the verify variant,
 * but store-only.  Always returns 0.
 */
static int
derived_dtype_write(derived_dtype_t buf, size_t bufsize, int num_loops,
    derived_dtype_t val)
{
	ulong_t slots = bufsize / sizeof (derived_dtype_s);
	int forward = 1;
	derived_dtype_t cursor = buf;

	while (num_loops--) {
		ulong_t left = slots;

		while (left--) {
			cursor->char_elem = val->char_elem;
			cursor->short_elem = val->short_elem;
			cursor->int_elem = val->int_elem;
			cursor->long_elem = val->long_elem;

			/* do not move off the final slot of a pass */
			if (left != 0) {
				cursor += forward ? 1 : -1;
			}
		}
		forward = !forward;
	}
	return (0);
}

static int
derived_dtype_test(size_t cache_size, int num_loops)
{
	derived_dtype_t buf;
	derived_dtype_s val;
	int lock_status;
	int rc = 0;

	/* Allocate and lock buffer for test loop */
	buf = (derived_dtype_t)memalign((size_t)pagesize,
	    (size_t)cache_size);
	if (buf == NULL) {
	    return (ENOMEM);
	}
	lock_status = mlock((void *)buf, cache_size);
	if (lock_status) {
		/* Log an error message(Notice) that locking failed */
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "derived_dtype mlock failed!!\n");
	}

	if ((rc = derived_dtype_write(buf, cache_size, num_loops,
	    &val)) != 0) {
		goto derived_test_exit;
	}
	if ((rc = derived_dtype_write_read_verify(buf, cache_size, num_loops,
	    &val)) != 0) {
		goto derived_test_exit;
	}

derived_test_exit:
	if (lock_status) {
		/* unlock the memory */
		lock_status = munlock((void *)buf, cache_size);
		if (lock_status) {
			/* Log an Fatal error? message */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "derived_dtype munlock failed!!\n");
		}
	}
	free(buf);

	return (rc);
}

static int cache_readmiss(ulong_t *buf, size_t bufsize,
    size_t row_size, int num_loops);

/*
 * The main cache hit stress function.
 *
 * Runs the basic data type hit loops, the write-read-write loop,
 * the derived data type loops and finally a half-line-stride read
 * loop, which gives roughly 50% hits.
 *
 * ENOMEM from a subtest is treated as "skip", not as a failure.
 * Returns 0 on success (or skip), non-zero on a real failure.
 */
static int
cache_hit_stress(size_t cache_size, size_t line_size, int num_loops)
{
	int rval = 0;
	ulong_t *buf;
	int lock_status;
	size_t bufsize;

	/*
	 * Test loop(s) for combined data types.  These may show lower
	 * cache access rates because of the compare loops.
	 */
	rval = basic_dtype_l2cachehit_stress(cache_size, num_loops);
	if (rval != 0 && rval != ENOMEM) {
		return (rval);
	}

	/* Write-read-write hit loop */
	rval = l2cachehit_wrrdwr(cache_size, line_size, num_loops);
	if (rval != 0 && rval != ENOMEM) {
		return (rval);
	}

	/* Derived data type hit loop */
	rval = derived_dtype_test(cache_size, num_loops);
	if (rval != 0 && rval != ENOMEM) {
		return (rval);
	}

	/*
	 * Read miss loop with an increment of linesize / 2; every
	 * other access hits, giving a 50% hit loop.
	 */
	bufsize = cache_size;
	buf = (ulong_t *)memalign((size_t)pagesize,
	    (size_t)bufsize);
	if (buf == NULL) {
		/* Out of memory is a skip here, not a failure */
		return (0);
	}
	lock_status = mlock((void *)buf, bufsize);
	if (lock_status) {
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "cache_miss mlock failed!!\n");
	}

	rval = cache_readmiss(buf, bufsize, line_size / 2, num_loops);

	/* Unlock only when the mlock above succeeded (returned 0) */
	if (lock_status == 0) {
		if (munlock((void *)buf, bufsize) != 0) {
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cache_miss munlock failed!!\n");
		}
	}
	free(buf);

	return (rval == ENOMEM ? 0 : rval);
}

/*
 * Cache write miss loop.
 *
 * The buffer is treated as a matrix whose rows are row_size bytes
 * wide (the line size, or a multiple of it).  Walking down a column
 * strides by row_size on every store, so each access lands in a new
 * line and misses.  The written value doubles after each full pass.
 */
static int
cache_writemiss(ulong_t *buf, size_t bufsize, size_t row_size, int num_loops,
    ulong_t write_val)
{
	ulong_t words_per_row = row_size / sizeof (ulong_t);
	ulong_t rows = bufsize / row_size;
	ulong_t col, row;
	ulong_t *wp;

	while (num_loops--) {
		for (col = 0; col < words_per_row; col++) {
			wp = buf + col;
			for (row = 0; row < rows; row++) {
				*wp = write_val;
				wp += words_per_row;
			}
		}
		/* new data for the next pass over the buffer */
		write_val *= 2;
	}

	return (0);
}

/*
 * Cache read miss loop: column-major walk so successive loads
 * stride by row_size and miss.  The running sum (seeded with a
 * random word) is stored to buf[0] at the end so the loads cannot
 * be optimized away.
 */
static int
cache_readmiss(ulong_t *buf, size_t bufsize, size_t row_size, int num_loops)
{
	volatile ulong_t *rd;
	ulong_t words_per_row = row_size / sizeof (ulong_t);
	ulong_t rows = bufsize / row_size;
	ulong_t col, row;
	volatile ulong_t accum = (ulong_t)lrand48();

	while (num_loops--) {
		for (col = 0; col < words_per_row; col++) {
			rd = buf + col;
			for (row = 0; row < rows; row++) {
				accum += *rd;
				rd += words_per_row;
			}
		}
	}
	/* publish the checksum to defeat dead-code elimination */
	*buf = accum;

	return (0);
}

/*
 * Cache read-miss-then-write loop: each element is loaded (miss,
 * because of the row_size stride), scaled by write_val and stored
 * back in place.
 */
static int
cache_readmiss_write(ulong_t *buf, size_t bufsize, size_t row_size,
    int num_loops, ulong_t write_val)
{
	volatile ulong_t *cell;
	ulong_t words_per_row = row_size / sizeof (ulong_t);
	ulong_t rows = bufsize / row_size;
	ulong_t col, row;

	while (num_loops--) {
		for (col = 0; col < words_per_row; col++) {
			cell = buf + col;
			for (row = 0; row < rows; row++) {
				/* load, scale, store back */
				*cell *= write_val;
				cell += words_per_row;
			}
		}
	}

	return (0);
}

/*
 * Cache write-miss-then-read loop: store a value (miss, because of
 * the row_size stride), double it, then immediately load the cell
 * back into a volatile accumulator so the read survives
 * optimization.  The accumulator is published to buf[0] at the end.
 */
static int
cache_writemiss_read(ulong_t *buf, size_t bufsize, size_t row_size,
    int num_loops, ulong_t write_val)
{
	ulong_t *cell;
	ulong_t words_per_row = row_size / sizeof (ulong_t);
	ulong_t rows = bufsize / row_size;
	ulong_t col, row;
	volatile ulong_t accum = (ulong_t)lrand48(); /* init for lint */

	while (num_loops--) {
		for (col = 0; col < words_per_row; col++) {
			cell = buf + col;
			for (row = 0; row < rows; row++) {
				*cell = write_val;
				/* vary the data per store */
				write_val *= 2;
				accum += *cell;
				cell += words_per_row;
			}
		}
	}
	*buf = accum;	/* use it to avoid optimization */

	return (0);
}

/*
 * Trash every line of one cache set before moving to the next set.
 * The per-line operation is selected by `operation` (WR_MISS,
 * RD_MISS, WR_MISS_RD or RD_MISS_WR); any other value just walks
 * the pointers without touching memory.
 */
static int
cache_sets_trash(ulong_t *buf, size_t cache_size, size_t line_size,
    int set_associativity, int num_loops, ulong_t write_val,
    int operation)
{
	size_t num_of_sets = cache_size / (line_size * set_associativity);
	/* one cache line, in ulong_t units */
	size_t line_words = line_size / (size_t)sizeof (ulong_t);
	/* distance between two lines that map to the same set */
	size_t set_stride = (line_size * num_of_sets) /
	    (size_t)sizeof (ulong_t);
	ulong_t *set_base = buf;
	ulong_t *p;
	volatile ulong_t sink = 0;
	int set_idx, way, count;

	for (set_idx = 0; set_idx < num_of_sets; set_idx++) {
		count = num_loops;
		/* hammer all the ways of this set, num_loops times */
		while (count--) {
			p = set_base;
			for (way = 0; way < set_associativity; way++) {
				switch (operation) {
				case WR_MISS:
					*p = write_val;
					p += set_stride;
					/* change the write value */
					write_val *= 2;
					break;

				case RD_MISS:
					sink += *p;
					p += set_stride;
					break;

				case WR_MISS_RD:
					*p = write_val;
					/* change the write value */
					write_val *= 2;
					sink = *p;
					p += set_stride;
					break;

				case RD_MISS_WR:
					(*p) *= write_val;
					p += set_stride;
					break;
				}
			}
		}
		set_base += line_words;
	}
	return (0);
}


/*
 * The main cache miss stress function. This contains the following
 * test loops:
 * 1. Matrix with row size equal to line size
 *
 * The test loop is as below :
 *	ulong_t *ptr0 = buf of size  bufsize = 2 * cache_size
 *	num_clmn = row_size / sizeof(ulong)
 *	num_row = bufsize / rowsize
 *	for (j = 0; j < num_clmn; j++)
 *		ptr0 = buf + j;
 *		for (i = 0; i < num_row; i++)
 *			<OPERATE> *buf
 *			buf += num_clmn
 *
 * The inner loop jumps in terms of rows. This accesses matrix
 * elements offset by row_size (either line size or cache_size),
 * thus causing a miss.
 *
 * The buffer size is set to 2 times cache size because :
 *	1. At the end of the inner loop the first part of the buffer
 *	   (size = cachesize) is out of the cache. The access to the
 *	   start of the buffer will be a miss.
 *	2. The same loop can be repeated N times causing the same miss
 *	   pattern. If the buffer size were equal to cache size, the
 *	   second iteration of the loop would hit, as nothing else has
 *	   replaced it (assuming standalone operation of the test loop).
 *	3. Assuming that the number of loops is a multiple of two, the
 *	   same buffer can be used to test different operations
 *	   (write, read, etc.).
 *
 * Operations supported : Write miss, Read miss, Read miss followed by
 *	write & Write miss followed by read.
 *
 * Returns 0 on success, -1 on failure, ENOMEM on allocation failure.
 */
static int
cache_miss_stress(size_t cache_size, size_t line_size,
	int set_associativity, int num_loops)
{
	ulong_t *buf;
	size_t bufsize = cache_size * 2;	/* see block comment above */
	int lock_status;
	ulong_t write_value;
	int rc = 0;
#ifdef L2CACHE_PERF_MONITOR
	cpc_event_t before, after, diff;
#endif	/* L2CACHE_PERF_MONITOR */

	/* Alloc and lock the memory */
	buf = (ulong_t *)memalign((size_t)pagesize,
	    (size_t)bufsize);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, bufsize);
	if (lock_status) {
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "cache_miss mlock failed!!\n");
	}

	write_value = (ulong_t)lrand48();

	/*
	 * Line size miss
	 */
	L2CACHE_CPC_SETUP(l2wr_perf_string);
	if ((rc = cache_writemiss(buf, bufsize, line_size, num_loops,
	    write_value)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("Write miss loop : ");

	L2CACHE_CPC_SETUP(l2rd_perf_string);
	if ((rc = cache_readmiss(buf, bufsize, line_size, num_loops))
	    != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("Read miss loop : ");

	L2CACHE_CPC_SETUP(l2rd_perf_string);
	if ((rc = cache_readmiss_write(buf, bufsize, line_size, num_loops,
	    write_value)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("Read miss write loop : ");

	L2CACHE_CPC_SETUP(l2wr_perf_string);
	if ((rc = cache_writemiss_read(buf, bufsize, line_size, num_loops,
	    write_value)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("Write miss read loop : ");

	/*
	 * Trash all the sets in the cache.  Only meaningful when the
	 * cache is set associative; in a direct-mapped cache each line
	 * is its own set, which the loops above already cover.
	 * NOTE: jump to the common exit -- the original returned here
	 * directly and leaked the (possibly locked) buffer.
	 */
	if (set_associativity <= 1) {
		goto cache_miss_exit;
	}

	L2CACHE_CPC_SETUP(l2wr_perf_string);
	if ((rc = cache_sets_trash(buf, bufsize, line_size,
	    set_associativity, num_loops, write_value,
	    WR_MISS)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("sets write miss loop : ");

	L2CACHE_CPC_SETUP(l2rd_perf_string);
	if ((rc = cache_sets_trash(buf, bufsize, line_size,
	    set_associativity, num_loops, write_value,
	    RD_MISS)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("sets read miss loop : ");

	L2CACHE_CPC_SETUP(l2wr_perf_string);
	if ((rc = cache_sets_trash(buf, bufsize, line_size,
	    set_associativity, num_loops, write_value,
	    WR_MISS_RD)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("sets write miss read loop : ");

	L2CACHE_CPC_SETUP(l2rd_perf_string);
	if ((rc = cache_sets_trash(buf, bufsize, line_size,
	    set_associativity, num_loops, write_value,
	    RD_MISS_WR)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("sets read miss write loop : ");

cache_miss_exit:
	/* Unlock only when the mlock above succeeded (returned 0) */
	if (lock_status == 0) {
		if (munlock((void *)buf, bufsize) != 0) {
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cache_miss munlock failed!!\n");
		}
	}
	free(buf);

	return (rc);
}

/*
 * Random offset access test loops:
 *	These loops exercise the cache at random locations, producing
 *	hits and misses in a random pattern.
 *
 *	Write-only, read-only and write-read-verify loops are
 *	supported.  A plain write pass followed by a separate
 *	read-verify pass is NOT possible: random writes do not
 *	guarantee full coverage of the buffer, so verifying a
 *	location that was never written would fail.  Read-verify
 *	works only on a fully written buffer.
 */
static int
rand_off_wr_loop(ulong_t *buf, size_t bufsize, size_t numloops,
	ulong_t write_val)
{
	ulong_t *offsets;
	size_t n_offsets;
	size_t k;

	/*
	 * Get a buffer of random offsets, each within [0, bufsize - 1].
	 */
	offsets = l2_get_randbuf(bufsize, &n_offsets);
	if (offsets == NULL) {
		return (ENOMEM);
	}

	for (; numloops != 0; numloops--) {
		for (k = 0; k < n_offsets; k++) {
			buf[offsets[k]] = write_val;
		}
		/* new data for the next pass */
		write_val *= 2;
	}
	free(offsets);
	return (0);
}

/*
 * Random offset write-read-verify loop: writes write_val at a
 * random offset, reads it back and compares.
 *
 * Returns 0 on success, -1 on a verify mismatch, ENOMEM when the
 * offset buffer cannot be allocated.
 */
static int
rand_off_wrrdvrfy_loop(ulong_t *buf, size_t bufsize, int numloops,
	ulong_t write_val)
{
	ulong_t *rand_buf;
	size_t	rand_bufsz, i;

	/*
	 * Fill the random buffer with random offsets. The offsets
	 * are limited by ]0 - (bufsize -1)].
	 */
	rand_buf = l2_get_randbuf(bufsize, &rand_bufsz);
	if (rand_buf == NULL) {
		return (ENOMEM);
	}

	while (numloops--) {
		for (i = 0; i < rand_bufsz; i++) {
			buf[rand_buf[i]] = write_val;
			if (buf[rand_buf[i]] != write_val) {
				/* was: returned without freeing rand_buf */
				free(rand_buf);
				return (-1);
			}
		}
		write_val *= 2;
	}
	free(rand_buf);
	return (0);
}

/*
 * Random offset read loop: loads from random positions into a
 * volatile accumulator so the reads stay observable.
 */
static int
rand_off_rd_loop(ulong_t *buf, size_t bufsize, int numloops)
{
	ulong_t *offsets;
	size_t n_offsets;
	size_t k;
	volatile ulong_t sink;

	/*
	 * Get a buffer of random offsets, each within [0, bufsize - 1].
	 */
	offsets = l2_get_randbuf(bufsize, &n_offsets);
	if (offsets == NULL) {
		return (ENOMEM);
	}
	/* seed the accumulator; also keeps it "used" for lint */
	sink = n_offsets;

	while (numloops--) {
		for (k = 0; k < n_offsets; k++) {
			sink += buf[offsets[k]];
		}
	}
	free(offsets);
	return (0);
}

/*
 * Driver for the random offset loops: runs the write, the
 * write-read-verify and the read loop over the same buffer.
 * Returns the first non-zero subtest result, else 0.
 */
static int
cache_rand_off_stress(ulong_t *buf, size_t bufsize, int numloops)
{
	int rc;
	ulong_t pattern = (ulong_t)lrand48();

	/* work in ulong_t elements from here on */
	bufsize /= sizeof (ulong_t);

	rc = rand_off_wr_loop(buf, bufsize, numloops, pattern);
	if (rc != 0) {
		return (rc);
	}

	pattern = (ulong_t)lrand48();
	rc = rand_off_wrrdvrfy_loop(buf, bufsize, numloops, pattern);
	if (rc != 0) {
		return (rc);
	}

	return (rand_off_rd_loop(buf, bufsize, numloops));
}

/*
 * March test to flush out stuck-at and coupling failures in the
 * Ecache.  Each pass picks a data pattern (from a fixed table, or a
 * random word), then marches over the whole buffer with row (line)
 * striding: write pattern, read, write complement, read, rewrite
 * pattern, read and compare.
 *
 * Returns 0 on success, -1 on a compare mismatch, ENOMEM when the
 * buffer cannot be allocated.
 */
static	int
cache_march_stress1(size_t cache_size, size_t line_size, int num_loops)
{
	ulong_t	*buf;
	size_t	bufsize = cache_size;
	int	ret = 0;
	int	lock_status = 0;

	ulong_t		pattern, cpattern;
	ulong_t 	*ptr0;
	ulong_t 	num_row, num_clmn; /* Num. of rows and columns */
	ulong_t 	i, j, patnum;   /* Loop variables */
	/*
	 * volatile so the read-only march steps (2 and 4) cannot be
	 * optimized away; consistent with cache_march_stress2.
	 */
	volatile ulong_t sum;
	int		arr_size;

	ulong_t	pats[] = {
		0xFFFFFFFFFFFFFFFFL, 0xAAAAAAAAAAAAAAAAL,
		0xfefefefefefefefeL, 0xfdfdfdfdfdfdfdfdL,
		0xfbfbfbfbfbfbfbfbL, 0xf7f7f7f7f7f7f7f7L,
		0xefefefefefefefefL, 0xdfdfdfdfdfdfdfdfL,
		0x7f7f7f7f7f7f7f7fL, 0xf0f0f0f0f0f0f0f0L,
		0x3333333333333333L, 0xccccccccccccccccL
	};

	arr_size = sizeof (pats) / sizeof (ulong_t);
	vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
	    "L2 March test with row striding\n");

	vts_message(0, VERBOSE, NULL, MARCH_TEST_ROW_STRIDE_START);

	/* Alloc and lock the memory */
	buf = (ulong_t *)memalign((size_t)pagesize, (size_t)bufsize);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, bufsize);
	if (lock_status) {
		/* Not fatal: the march runs on unlocked memory */
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "cache_march_stress1 mlock failed!!\n");
	}
	num_clmn = line_size / sizeof (ulong_t);
	num_row = bufsize / line_size;

	while (num_loops--) {
		/* Pick a table pattern, or (one time in 13) a random one */
		patnum = l2_getrandom(arr_size + 1);
		if (patnum < (ulong_t)arr_size) {
			pattern = pats[patnum];
		} else {
			pattern = (ulong_t)lrand48();
		}
		cpattern = ~pattern;

		/* Doing March test with word size patterns */
		/* Step 1: Write to the entire E$ one word at a time */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				*ptr0 = pattern;
				ptr0 += num_clmn;
			}
		}

		/* Step 2: Read the entire E$ one word at a time */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				sum = *ptr0;
				ptr0 += num_clmn;
			}
		}

		/* Step 3: Write the pattern complement to the entire E$ */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				*ptr0 = cpattern;
				ptr0 += num_clmn;
			}
		}

		/* Step 4: Read the entire E$ */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				sum = *ptr0;
				ptr0 += num_clmn;
			}
		}

		/* Step 5: Write the original pattern to the entire E$ */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				*ptr0 = pattern;
				ptr0 += num_clmn;
			}
		}

		/* Step 6: Read the entire E$ and compare */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				sum = *ptr0;
				if (sum != pattern) {
					ret = -1;
					goto cache_march1_exit;
				}
				ptr0 += num_clmn;
			}
		}
	}

cache_march1_exit:
	/* Unlock only when the mlock above succeeded (returned 0) */
	if (lock_status == 0) {
		if (munlock((void *)buf, bufsize) != 0) {
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cache_march_stress1 munlock failed!!\n");
		}
	}
	vts_message(0, VERBOSE, NULL, MARCH_TEST_ROW_STRIDE_END);
	free(buf);
	return (ret);
}

/*
 * March test to flush out the stuck at and coupling failures
 * in the Ecache.
 */
/*
 * March test to flush out stuck-at and coupling failures in the
 * Ecache, walking the buffer one word at a time (column striding).
 *
 * Each outer pass picks a data pattern -- usually from the fixed
 * table, occasionally fully random -- then marches:
 *   write pattern / read / write complement / read / write pattern /
 *   read-and-verify.
 *
 * Returns 0 on success, -1 on a verify mismatch, ENOMEM if the
 * buffer cannot be allocated.
 */
static  int
cache_march_stress2(size_t cache_size, int num_loops)
{
	ulong_t *buf;
	size_t bufsize = cache_size;
	int ret = 0;
	int lock_status = 0;
	int	arr_size;

	ulong_t pattern, cpattern;
	ulong_t *ptr0;
	size_t num_words = cache_size / (size_t)sizeof (ulong_t);
	size_t j;
	volatile ulong_t sum, patnum; /* Loop variables */

	ulong_t pats[] = {
		0xFFFFFFFFFFFFFFFFL, 0xAAAAAAAAAAAAAAAAL,
		0xfefefefefefefefeL, 0xfdfdfdfdfdfdfdfdL,
		0xfbfbfbfbfbfbfbfbL, 0xf7f7f7f7f7f7f7f7L,
		0xefefefefefefefefL, 0xdfdfdfdfdfdfdfdfL,
		0x7f7f7f7f7f7f7f7fL, 0xf0f0f0f0f0f0f0f0L,
		0x3333333333333333L, 0xccccccccccccccccL
	};

	arr_size = sizeof (pats) / sizeof (ulong_t);
	vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
	    "L2 March test with column striding");

	vts_message(0, VERBOSE, NULL, MARCH_TEST_COLUMN_STRIDE_START);
	/* Alloc and lock the memory */
	buf = (ulong_t *)memalign((size_t)pagesize, (size_t)bufsize);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, bufsize);
	if (lock_status) {
		/* Locking is best effort; log and continue unlocked */
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "cache_march2 mlock failed!!\n");
	}

	while (num_loops--) {
		patnum = l2_getrandom(arr_size + 1);
		if (patnum < arr_size) {
			pattern = pats[patnum];
		} else {
			/* Occasionally march with a fully random pattern */
			pattern = (ulong_t)lrand48();
		}
		cpattern = ~pattern;

		/*
		 * Doing March test with word size patterns.  The walk
		 * pointer is reset once per step (not once per word),
		 * so each step really covers the whole buffer.
		 */

		/* Step 1: Write to the entire E$ one word at a time */
		ptr0 = buf;
		for (j = 0; j < num_words; j++) {
			*ptr0 = pattern;
			ptr0++;
		}

		/* Step 2: Read the entire E$ one word at a time */
		ptr0 = buf;
		for (j = 0; j < num_words; j++) {
			sum = *ptr0;
			ptr0++;
		}

		/* Step 3: Write pattern complement to the entire E$ */
		ptr0 = buf;
		for (j = 0; j < num_words; j++) {
			*ptr0 = cpattern;
			ptr0++;
		}

		/* Step 4: Read the entire E$ */
		ptr0 = buf;
		for (j = 0; j < num_words; j++) {
			sum = *ptr0;
			ptr0++;
		}

		/* Step 5: Write the original pattern to the entire E$ */
		ptr0 = buf;
		for (j = 0; j < num_words; j++) {
			*ptr0 = pattern;
			ptr0++;
		}

		/* Step 6: Read the entire E$ and compare */
		ptr0 = buf;
		for (j = 0; j < num_words; j++) {
			sum = *ptr0;
			if (sum != pattern) {
				ret = -1;
				goto cache_march2_exit;
			}
			ptr0++;
		}
	}

cache_march2_exit:
	/* unlock (only if the earlier mlock succeeded) and free the memory */
	if (lock_status == 0) {
		if (munlock((void *)buf, bufsize) != 0) {
			/* Non-fatal: log and continue with the free */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cache_march2 munlock failed!!\n");
		}
	}
	vts_message(0, VERBOSE, NULL, MARCH_TEST_COLUMN_STRIDE_END);
	free(buf);
	return (ret);
}

/*
 * The Main cache stress function callable from the test main.
 *
 * Arguments : Number of loops and the cache geometry information
 * Returns : 0 on success, non-zero (-1 or an errno value) on failure
 */
int
l2cache_stress(int num_loops, cache_info_t *cache_info)
{
	int rval = 0;
	size_t cache_size, line_size;
	ulong_t	*buf;
	int lock_status;
	int set_associativity;

	pagesize = (size_t)sysconf(_SC_PAGESIZE);

	cache_size = cache_info->ecache_size;
	line_size = cache_info->ecache_line_size;
	set_associativity = cache_info->ecache_associativity;

#ifdef L2CACHE_PERF_MONITOR
	if (l2_cpc_valid(&l2cache_cpc_ver, L2_CACHE_TEST_NAME,
	    L2_LIBCPC_MSG) == 0) {
		l2cache_cpc_valid = 1;
	}

	/*
	 * Initialize the performance monitoring strings.
	 * US-III, US-III+ and US-IIIi all have the same
	 * performance counters.
	 *
	 * Use cpc_getcciname to get the printable description
	 * of processor performance counter interfaces.
	 */
	if (l2cache_cpc_valid) {
		const char *cciname;

		cciname = cpc_getcciname(l2cache_cpc_ver);

		if (cciname && ((strncmp(cciname, "UltraSPARC III",
		    strlen("UltraSPARC III")) == 0) ||
		    (strncmp(cciname, "UltraSPARC III+ & IV",
		    strlen("UltraSPARC III+ & IV")) == 0))) {
			/* for US-III/US-III+/US-IV cpus */
			l2wr_perf_string = "pic0=EC_ref,pic1=EC_misses";
			l2rd_perf_string = "pic0=EC_ref,pic1=EC_misses";
			l2_pic0_name = "EC_ref";
			l2_pic1_name = "EC_misses";
		} else if (cciname) { /* UltraSParc I&II */
			l2wr_perf_string = "pic0=EC_ref,pic1=EC_hit";
			l2rd_perf_string = "pic0=EC_ref,pic1=EC_hit";
			l2_pic0_name = "EC_ref";
			l2_pic1_name = "EC_hit";
		} else {
			/* This should NOT happen */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cpc_getcciname failed!!\n");
			l2cache_cpc_valid = 0;
		}
	}
#endif

	/* cache_size/line_size are size_t: cast and print with %lx */
	vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
	    "Cache Access stress for cache_size %lx line_size %lx"
	    " num_loops %d\n", (ulong_t)cache_size, (ulong_t)line_size,
	    num_loops);

	/* Cache hit stress */
	if ((rval = cache_hit_stress(cache_size, line_size, num_loops))
	    != 0) {
		return (rval);
	}

	/* Cache miss stress */
	if ((rval = cache_miss_stress(cache_size, line_size,
	    set_associativity, num_loops)) != 0) {
		return (rval);
	}

	/* March testing (row striding) for stuck-at and coupling faults */
	if ((rval = cache_march_stress1(cache_size, line_size, num_loops))
	    != 0) {
		return (rval);
	}

	/* March testing (column striding) for stuck-at and coupling faults */
	if ((rval = cache_march_stress2(cache_size, num_loops))
	    != 0) {
		return (rval);
	}

	/*
	 * 17N March test writing one complete cache line at a time
	 * using ldx/stx.  There is no checking in the subtest; it
	 * depends on ECC checking to catch the failures.
	 */
	if ((rval = cache_march_stress3(cache_size, line_size, num_loops))
	    != 0) {
		return (rval);
	}

	/* Random swap testing */
	if ((rval = crand(cache_size)) != 0) {
		return (rval);
	}

	/*
	 * Random offset testing.
	 *
	 * Allocate and (best effort) lock the buffer, run the loops,
	 * then clean up on every path so the buffer is neither leaked
	 * nor left locked when the loops fail.
	 */
	buf = (ulong_t *)memalign((size_t)pagesize,
	    (size_t)cache_size);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, cache_size);
	if (lock_status) {
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "l2cache_stress mlock failed!!\n");
	}

	rval = cache_rand_off_stress(buf, cache_size, num_loops);

	/* unlock (only if the earlier mlock succeeded) and free the memory */
	if (lock_status == 0) {
		if (munlock((void *)buf, cache_size) != 0) {
			/* Non-fatal: log and continue with the free */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "l2cache_stress munlock failed!!\n");
		}
	}
	free(buf);

	return (rval);
}
/*
 * Copyright 2002 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%W% %E% SMI"

/*
 * This file contains test loops for stressing the Level 2 data cache.
 * It currently contains the following :
 * 1. Hit Loops
 *	1.1 Read hits
 *		Write (first write to line miss, followed by hits in line)
 *		Read in increments of line size(all hits to L2)
 *		Write in increments of line size(all hits to L2)
 *	1.2 Write hits
 *		Write the complete buffer of size equal to cache size. Repeat
 *		This is done for ulong and double data types.
 *	1.3 50% read hits
 *		Read in increments of line size / 2. First read will be a miss
 *		followed by a hit.
 *	1.4 Derived data type hits
 *		Write and Write-Read-Verify loops with a derived datatype
 *		This accesses the L2 cache with different alignments.
 *
 * 2. Miss Loops
 *	2.1. Write miss loop
 *	2.2. Read miss loop
 *	2.3. Write miss read loop
 *	2.4. Read miss write loop
 *
 *
 * 3. Set Miss Loops  : Access all lines for the set before accessing next
 *	set. Operations tested are :
 *	  3.1. Write set miss loop
 *	  3.2. Read set miss loop
 *	  3.3. Write miss Read loop
 *	  3.4. Read miss Write loop
 *
 *
 * 4. Random offset loops : Test loops which access a buffer at random
 *    locations. The operations for random access loop are :
 *	 4.1 Write
 *	 4.2 Write-Read-Verify
 *	 4.3 Read
 *
 * TBD :
 *	basic_dtype_l2cachehit_stress : datatypes char, short, int
 *
 */

#include	<sys/types.h>
#include	<sys/errno.h>
#include	<sys/time.h>
#include	<signal.h>
#include	<stdlib.h>
#include	<sys/mman.h>
#include	<unistd.h>
#include	<stdio.h>
#include	<testinfo.h>
#include	<note.h>
#include	"l2_util.h"
#include	"l2cache.h"

#define	L2_CACHE_TEST_NAME	"l2cache"

/*
 * Definition of the derived data type used by the derived-datatype tests
 */
#pragma pack(1)
/*
 * Mixed-size record packed to 1-byte alignment so that successive
 * members (and successive array elements) land on odd boundaries.
 * Used by the derived-datatype hit loops to drive the L2 cache with
 * differently aligned accesses.
 */
typedef struct derived_dtype {
	char	char_elem;
	short	short_elem;
	int	int_elem;
	ulong_t	long_elem;
} derived_dtype_s;
#pragma pack()
typedef derived_dtype_s *derived_dtype_t;	/* pointer shorthand */

/* Pagesize for this system */
static size_t pagesize;

static int basic_dtype_l2cachehit_stress(size_t bufsize,
    int number_loops);
static int l2cachehit_wrrdwr(size_t bufsize, size_t line_size,
    int num_loops);
static int derived_dtype_write_read_verify(derived_dtype_t buf,
    size_t bufsize, int num_loops, derived_dtype_t val);
static int derived_dtype_write(derived_dtype_t buf,
    size_t bufsize, int num_loops, derived_dtype_t val);
static int cache_writemiss(ulong_t *buf, size_t bufsize,
    size_t row_size, int num_loops, ulong_t write_val);
static int cache_readmiss(ulong_t *buf, size_t bufsize,
    size_t row_size, int num_loops);
static int cache_readmiss_write(ulong_t *buf, size_t bufsize,
    size_t row_size, int num_loops, ulong_t write_val);
static int cache_writemiss_read(ulong_t *buf, size_t bufsize,
    size_t row_size, int num_loops, ulong_t write_val);
static int cache_sets_trash(ulong_t *buf, size_t cache_size,
    size_t line_size, int set_associativity, int num_loops,
    ulong_t write_val, int operation);
static int rand_off_wr_loop(ulong_t *buf, size_t bufsize,
    size_t numloops, ulong_t write_val);
static int rand_off_wrrdvrfy_loop(ulong_t *buf, size_t bufsize,
    int numloops, ulong_t write_val);
static int rand_off_rd_loop(ulong_t *buf, size_t bufsize,
    int numloops);
static	int cache_march_stress1(size_t cache_size,
    size_t line_size, int num_loops);
static  int cache_march_stress2(size_t cache_size, int num_loops);

/*
 * Optimization for all test loops to be minimal.
 */
#pragma	opt 1	(basic_dtype_l2cachehit_stress, l2cachehit_wrrdwr)
#pragma	opt 1	(derived_dtype_write_read_verify, derived_dtype_write)
#pragma	opt 1	(cache_writemiss, cache_readmiss, cache_writemiss_read)
#pragma	opt 1	(cache_sets_trash, rand_off_wr_loop)
#pragma	opt 1	(rand_off_wrrdvrfy_loop, rand_off_rd_loop)
#pragma	opt 1	(cache_march_stress1, cache_march_stress2)

/* flags to differentiate the cache sets trash loops */
#define		LOOP_BASE	0
#define		WR_MISS		((LOOP_BASE)+1)
#define		RD_MISS		((LOOP_BASE)+2)
#define		WR_MISS_RD	((LOOP_BASE)+3)
#define		RD_MISS_WR	((LOOP_BASE)+4)

/* Cache random swapping function */
extern  int crand(ulong_t);
extern  int cache_march_stress3(size_t, size_t, int);

#ifdef L2CACHE_PERF_MONITOR
/*
 * Processor event strings for the L2 read and write statistics.
 * These strings are supported on UltraSPARC I, II and III.
 * TBD : Initialize these strings dynamically, based on the
 * processor type.
 */
static char *l2wr_perf_string;
static char *l2rd_perf_string;

/* Event strings for display only */
static char *l2_pic0_name;
static char *l2_pic1_name;

/*
 * l2cache_cpc_valid - Set if CPC is valid for this processor
 * l2cache_perf_valid- Set if cpc_setup and  get_sample for
 *		before was obtained correctly
 * l2cache_cpc_ver - CPU CPC version
 */
static int l2cache_cpc_valid = 0;
static int l2cache_perf_valid;
static int l2cache_cpc_ver;

/*
 * Program the CPC counters per 'perf_string' and take the "before"
 * sample.  Expects a local cpc_event_t named 'before' in the
 * enclosing function (see cache_miss_stress).  On any failure,
 * l2cache_perf_valid is cleared so L2CACHE_CPC_AFTER() is a no-op.
 */
#define	L2CACHE_CPC_SETUP(perf_string)	\
	if (l2cache_cpc_valid) {	\
		l2cache_perf_valid =	\
		    (l2cache_cpc_setup((perf_string),	\
		    l2cache_cpc_ver) != -1);	\
		if (l2cache_perf_valid &&	\
		    cpc_take_sample(&before) == -1) {	\
			l2cache_perf_valid = 0;	\
			vts_message(	\
			    NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,	\
			    "In cpc_take_sample(before)\n");	\
		}	\
	}

/*
 * Take the "after" sample, diff it against "before" and log both PIC
 * counts with 'msg_prefix'.  Expects local cpc_event_t variables
 * 'before', 'after' and 'diff' in the enclosing function.  Does
 * nothing unless the preceding L2CACHE_CPC_SETUP() succeeded.
 */
#define	L2CACHE_CPC_AFTER(msg_prefix)	\
	if (l2cache_cpc_valid && l2cache_perf_valid) {	\
		if (cpc_take_sample(&after) == -1) {	\
			vts_message(	\
			    NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,	\
			    "In cpc_take_sample(after)\n");	\
		} else {	\
			cpc_event_diff(&diff, &after, &before);	\
			diff.ce_pic[1] = after.ce_pic[1] -	\
				before.ce_pic[1];	\
			diff.ce_hrt = after.ce_hrt - before.ce_hrt;	\
			vts_message(	\
			    NO_EXIT, VERBOSE, L2_CACHE_TEST_NAME, 0,	\
			    "L2 cache %s%s %llu, %s %llu\n",	\
			    (msg_prefix), l2_pic0_name,	\
			    diff.ce_pic[0], l2_pic1_name,	\
			    diff.ce_pic[1]);	\
		}	\
	}
#else
/* Stubs used when performance monitoring is compiled out */
#define	L2CACHE_CPC_SETUP(perf_string)
#define	L2CACHE_CPC_AFTER(msg_prefix)
#endif	/* L2CACHE_PERF_MONITOR */


/*
 * L2 cache hit loops
 * --------------------
 * L2 Cache Write/Write-Read-Verify cache hit test loops.
 * These test loops are common for all basic data types.  These test
 * loops continue for the number of test loops specified as an
 * argument.
 *
 * The general format of test loop are as below :
 *	for num_loops do
 *		Continue till buffer limit is reached
 *			<Test-operation>
 *
 * These defines SHOULD be used as inline code. The following
 * SHOULD be passed as parameters :
 *	1. datatype	- Any of the basic datatype
 *	2. datatype	*buf
 *	4. int		ret_val : Return Value
 *	5. datatype	val : Value to write / Read verify
 *	6. int		num_loops
 *
 * ASSUMPTION : Only called once in a function for a datatype.
 *              Multiple calls WILL result in compile errors
 *
 * Test Loops
 *		Write
 *		Write-Read-Verify
 */

/*
 * Write-only L2 hit loop, expanded inline once per basic datatype.
 * Sweeps 'localbuf' writing 'val', reversing sweep direction on
 * every pass; always sets ret_val to 0.
 *
 * NOTE(review): 'limit' is sizeof (buf) / sizeof (datatype); the
 * caller in this file (basic_dtype_l2cachehit_stress) passes a
 * POINTER, so sizeof (buf) is the pointer size and 'limit' is tiny
 * (1 for ulong_t) -- confirm the loop is meant to cover the whole
 * buffer.
 * NOTE(review): the leading "##datatype" token paste is a Sun cc
 * idiom; a standard preprocessor rejects ## at the start of a
 * replacement list.
 */
#define	l2cachehit_write(datatype, buf, ret_val,	\
    val, num_loops) \
	{ \
		int limit = sizeof (buf) / sizeof (##datatype);	\
		int direction = 1;	\
##datatype *localbuf = (buf);	\
		int cnt = num_loops;	\
		while (cnt--) { \
			int index = limit;	\
			while (index--) {	\
				*localbuf = (val);	\
				localbuf += direction;	\
			}	\
			direction *= -1;	\
		}	\
		(ret_val) = 0;	\
	}

/*
 * Write-then-read-verify L2 hit loop, expanded inline once per basic
 * datatype.  Writes 'val', immediately reads it back, and on a
 * mismatch sets ret_val to -1 and bails out through a generated
 * label; otherwise ret_val ends up 0.
 *
 * The label wr_rd_vrfy_<datatype> is pasted from the datatype name,
 * which is why (per the block comment above) this macro may only be
 * used once per datatype within a single function.
 *
 * NOTE(review): as with l2cachehit_write, 'limit' uses sizeof (buf);
 * with a pointer argument that is the pointer size, not the buffer
 * size -- confirm intent.
 */
#define	l2cachehit_write_read_verify(datatype, buf, \
    ret_val, val, num_loops) \
	{ \
##datatype	*localbuf = (buf);	\
		int direction = 1;	\
		int limit = sizeof (buf) / sizeof (##datatype);	\
		int cnt = num_loops;	\
		while (cnt--) { \
			int index = limit;	\
			while (index--) {	\
					*localbuf = val;	\
					if (*localbuf != val) {	\
						(ret_val) = -1;	\
						goto wr_rd_vrfy_##datatype; \
				}	\
				localbuf += direction;	\
			}	\
			direction *= -1;	\
		}	\
		ret_val = 0; \
wr_rd_vrfy_##datatype:	\
		;	\
	}

/*
 * Perform hit test loops for all REQUIRED basic data types,
 * for the specified buffer size.
 *
 * Currently this supports ulong and double
 */
/*
 * Run the write and write-read-verify hit loops for the supported
 * basic data types (ulong_t and double) over one shared, page-aligned
 * buffer of 'bufsize' bytes.
 *
 * Returns 0 on success, -1 on a read-verify mismatch, ENOMEM if the
 * buffer cannot be allocated.  mlock failure is logged but not fatal.
 *
 * NOTE(review): the l2cachehit_* macros size their loops with
 * sizeof (buf); here they receive pointers (ulong_buf/double_buf),
 * so sizeof yields the pointer size and the loops touch only the
 * first element(s) -- confirm this is intended.
 */
static int
basic_dtype_l2cachehit_stress(size_t bufsize, int number_loops)
{
	void	*buf;		/* single allocation shared by both views */
	ulong_t	*ulong_buf;	/* ulong_t view of buf */
	ulong_t	write_ulong = (ulong_t)lrand48();
	double	*double_buf;	/* double view of the same buf */
	double	write_double = drand48();
	int	rc = 0;
	int lock_status = 0;

	/* Allocate and lock a buffer for all loops */
	buf = (ulong_t *)memalign(pagesize, (size_t)bufsize);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, bufsize);
	if (lock_status) {
		/* Log an error message(Notice) that locking failed */
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "basic_dtype, ulong mlock failed!!\n");
	}
	ulong_buf = (ulong_t *)buf;
	double_buf = (double *)buf;

	/* Do basic data type test  loops for ulong_t */
	l2cachehit_write(ulong_t, ulong_buf, rc,
	    write_ulong, number_loops);
	if (rc)
		goto basic_dtype_exit;

	write_ulong *= 2;	/* Change write_ulong */

	l2cachehit_write_read_verify(ulong_t, ulong_buf,
	    rc, write_ulong, number_loops);
	if (rc)
		goto basic_dtype_exit;

	/* Do basic data type test  loops for double */
	l2cachehit_write(double, double_buf,
	    rc, write_double, number_loops);
	if (rc)
		goto basic_dtype_exit;

	write_double *= 2;
	l2cachehit_write_read_verify(double, double_buf,
	    rc, write_double, number_loops);
	if (rc)
		goto basic_dtype_exit;

basic_dtype_exit:
	/*
	 * unlock the memory
	 * NOTE(review): mlock returns 0 on success, so lock_status != 0
	 * means the lock FAILED; this condition munlocks only in the
	 * failure case -- it looks inverted relative to the comment.
	 */
	if (lock_status) {
		lock_status = munlock((void *)double_buf, bufsize);
		if (lock_status) {
			/* Log an Fatal error? message */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME,
			    0, "basic_dtype munlock failure!!\n");
		}
	}

	/* double_buf aliases buf: this frees the single allocation */
	free(double_buf);
	return (rc);
}

/*
 * This is a Read miss loop followed by  Read hit loop and Write hit
 * loop. This is currently supported only for ulong_t datatype. There
 * apparently does not seem to be any value add by extending this for
 * other datatypes.
 */
/*
 * Write-fill followed by a read-hit loop and a write-hit loop.
 * Done only for ulong_t; other data types add no coverage here.
 *
 * Returns 0 on success, ENOMEM if the buffer cannot be allocated.
 */
static int
l2cachehit_wrrdwr(size_t bufsize, size_t line_size, int num_loops)
{
	ulong_t	write_val;
	ulong_t	*buf;
	int	lock_status = 0;
	ulong_t	limit;		/* buffer length in ulong_t elements */
	size_t	stride;		/* line size in ulong_t elements */
	int	loop_cnt;
	ulong_t	i;

	write_val = (ulong_t)lrand48();
	limit = bufsize / sizeof (ulong_t);

	/*
	 * The loop index counts ulong_t elements, so convert the line
	 * size from bytes to elements; stepping by line_size elements
	 * would overshoot a cache line by a factor of sizeof (ulong_t).
	 */
	stride = line_size / sizeof (ulong_t);
	if (stride == 0)
		stride = 1;
	loop_cnt = num_loops;

	/* Allocate and lock the buffer for test loop */
	buf = (ulong_t *)memalign((size_t)pagesize,
	    (size_t)bufsize);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, bufsize);
	if (lock_status) {
		/* Locking is best effort; log and continue unlocked */
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "l2cachehit_wrrdwr, mlock failed!!\n");
	}

	/*
	 * 1. Fill up the buffer by writes. The first write in a line is
	 *    a miss, the rest are hits. As the Dcache is not write
	 *    allocate, the data is written directly to the L2 cache.
	 * 2. Read in increments of the L2 line size: all hits, since
	 *    the writes above populated the cache.
	 * 3. Write in increments of the line size: also all hits.
	 */
	for (i = 0; i < limit; i++) {
		buf[i] = write_val;
	}
	while (loop_cnt--) {
		for (i = 0; i < limit; i += stride) {
			write_val += buf[i];
		}
	}
	loop_cnt = num_loops;
	while (loop_cnt--) {
		for (i = 0; i < limit; i += stride) {
			buf[i] = write_val;
		}
	}

	/* unlock (only if the earlier mlock succeeded) and free */
	if (lock_status == 0) {
		if (munlock((void *)buf, bufsize) != 0) {
			/* Non-fatal: log and continue with the free */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME,
			    0, "l2cachehit_wrrdwr: munlock failed!!\n");
		}
	}
	free(buf);

	return (0);
}

/*
 * Derived datatype tests. This contains Write & Write-Read-Verify
 */
/*
 * Write each element of 'buf' from the reference record 'val' and
 * immediately read it back, alternating the sweep direction on every
 * pass.  Returns -1 on the first miscompare, 0 otherwise.
 */
static int
derived_dtype_write_read_verify(derived_dtype_t buf, size_t bufsize,
    int num_loops, derived_dtype_t val)
{
	ulong_t nelem = bufsize / sizeof (derived_dtype_s);
	int forward = 1;	/* 1 = ascending sweep, 0 = descending */

	while (num_loops--) {
		ulong_t left;

		for (left = nelem; left > 0; ) {
			left--;

			buf->char_elem = val->char_elem;
			buf->short_elem = val->short_elem;
			buf->int_elem = val->int_elem;
			buf->long_elem = val->long_elem;

			if (buf->char_elem != val->char_elem ||
			    buf->short_elem != val->short_elem ||
			    buf->int_elem != val->int_elem ||
			    buf->long_elem != val->long_elem) {
				return (-1);
			}

			/* Stay put on the last element of a pass */
			if (left > 0)
				buf += forward ? 1 : -1;
		}
		forward = !forward;
	}
	return (0);
}

/*
 * Write each element of 'buf' from the reference record 'val',
 * alternating the sweep direction on every pass.  Always returns 0.
 */
static int
derived_dtype_write(derived_dtype_t buf, size_t bufsize, int num_loops,
    derived_dtype_t val)
{
	ulong_t nelem = bufsize / sizeof (derived_dtype_s);
	int forward = 1;	/* 1 = ascending sweep, 0 = descending */

	while (num_loops--) {
		ulong_t left;

		for (left = nelem; left > 0; ) {
			left--;

			buf->char_elem = val->char_elem;
			buf->short_elem = val->short_elem;
			buf->int_elem = val->int_elem;
			buf->long_elem = val->long_elem;

			/* Stay put on the last element of a pass */
			if (left > 0)
				buf += forward ? 1 : -1;
		}
		forward = !forward;
	}
	return (0);
}

static int
derived_dtype_test(size_t cache_size, int num_loops)
{
	derived_dtype_t buf;
	derived_dtype_s val;
	int lock_status;
	int rc = 0;

	/* Allocate and lock buffer for test loop */
	buf = (derived_dtype_t)memalign((size_t)pagesize,
	    (size_t)cache_size);
	if (buf == NULL) {
	    return (ENOMEM);
	}
	lock_status = mlock((void *)buf, cache_size);
	if (lock_status) {
		/* Log an error message(Notice) that locking failed */
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "derived_dtype mlock failed!!\n");
	}

	if ((rc = derived_dtype_write(buf, cache_size, num_loops,
	    &val)) != 0) {
		goto derived_test_exit;
	}
	if ((rc = derived_dtype_write_read_verify(buf, cache_size, num_loops,
	    &val)) != 0) {
		goto derived_test_exit;
	}

derived_test_exit:
	if (lock_status) {
		/* unlock the memory */
		lock_status = munlock((void *)buf, cache_size);
		if (lock_status) {
			/* Log an Fatal error? message */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "derived_dtype munlock failed!!\n");
		}
	}
	free(buf);

	return (rc);
}

static int cache_readmiss(ulong_t *buf, size_t bufsize,
    size_t row_size, int num_loops);

/*
 * The main cache hit stress function.
 *
 * The line size and cache size are passed to this.
 */
/*
 * The main cache hit stress function.  Runs the basic-datatype,
 * write-read-write, derived-datatype and 50%-hit loops in turn.
 *
 * ENOMEM from any sub-loop is treated as a skip rather than a
 * failure, so a transient allocation problem does not fail the test.
 *
 * Returns 0 on success (or skip), non-zero on a data miscompare.
 */
static int
cache_hit_stress(size_t cache_size, size_t line_size, int num_loops)
{
	int rval = 0;
	ulong_t *buf;
	int lock_status;
	size_t bufsize;		/* size_t: 'int' could truncate cache_size */

	/*
	 * Do test loop(s) for combined data types. These loops may give
	 * lower cache access rates because of the compare loops.
	 */
	if ((rval = basic_dtype_l2cachehit_stress(cache_size, num_loops))
	    != 0) {
		if (rval != ENOMEM)
			return (rval);
	}

	/* Do the test loop for write-read-Write hits */
	if ((rval = l2cachehit_wrrdwr(cache_size, line_size, num_loops))
	    != 0) {
		if (rval != ENOMEM)
			return (rval);
	}

	/* Do the derived data type hit loop */
	if ((rval = derived_dtype_test(cache_size, num_loops)) != 0) {
		if (rval != ENOMEM)
			return (rval);
	}

	/*
	 * Read loop with an increment of linesize / 2: every other
	 * access hits, giving roughly a 50% hit rate.
	 */
	bufsize = cache_size;
	buf = (ulong_t *)memalign((size_t)pagesize,
	    (size_t)bufsize);
	if (buf == NULL) {
		/* Out of memory: skip this loop rather than fail */
		return (0);
	}
	lock_status = mlock((void *)buf, bufsize);
	if (lock_status) {
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "cache_hit mlock failed!!\n");
	}

	rval = cache_readmiss(buf, bufsize, line_size / 2, num_loops);

	/* unlock (only if the earlier mlock succeeded) and free the memory */
	if (lock_status == 0) {
		if (munlock((void *)buf, bufsize) != 0) {
			/* Non-fatal: log and continue with the free */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cache_hit munlock failed!!\n");
		}
	}
	free(buf);

	/* ENOMEM from the read loop is also treated as a skip */
	return (rval == ENOMEM ? 0 : rval);
}

/*
 * Cache write miss loops.
 *
 * These loops take row_size as a parameter. This can be either line size
 * or multiple of it. Accessing a buffer in increments of line size causes
 * a miss. Accessing buffer with a different increment causes miss in a
 * different manner.
 */
/*
 * Write-miss loop: treat the buffer as a matrix whose rows are
 * 'row_size' bytes wide and store column by column, so consecutive
 * stores land one row (one cache line, or a multiple) apart.
 * Always returns 0.
 */
static int
cache_writemiss(ulong_t *buf, size_t bufsize, size_t row_size, int num_loops,
    ulong_t write_val)
{
	ulong_t words_per_row = row_size / sizeof (ulong_t);
	ulong_t rows = bufsize / row_size;
	ulong_t col, row;
	ulong_t *wp;

	while (num_loops--) {
		for (col = 0; col < words_per_row; col++) {
			wp = buf + col;
			for (row = 0; row < rows; row++) {
				*wp = write_val;
				wp += words_per_row;
			}
		}
		/* Vary the data written on every pass */
		write_val *= 2;
	}

	return (0);
}

/*
 * Read-miss loop: column-order reads through a volatile pointer so
 * consecutive loads land one row apart.  The accumulated sum is
 * stored back into buf[0] so the reads cannot be optimized away.
 * Always returns 0.
 */
static int
cache_readmiss(ulong_t *buf, size_t bufsize, size_t row_size, int num_loops)
{
	volatile ulong_t *rp;
	ulong_t words_per_row = row_size / sizeof (ulong_t);
	ulong_t rows = bufsize / row_size;
	ulong_t col, row;
	volatile ulong_t acc;

	acc = (ulong_t)lrand48();

	while (num_loops--) {
		for (col = 0; col < words_per_row; col++) {
			rp = buf + col;
			for (row = 0; row < rows; row++) {
				acc += *rp;
				rp += words_per_row;
			}
		}
	}
	/* Keep the reads observable */
	*buf = acc;

	return (0);
}

/*
 * Read-miss-then-write loop: each element is read and multiplied in
 * place through a volatile pointer, striding one row per access.
 * Always returns 0.
 */
static int
cache_readmiss_write(ulong_t *buf, size_t bufsize, size_t row_size,
    int num_loops, ulong_t write_val)
{
	volatile ulong_t *rwp;
	ulong_t words_per_row = row_size / sizeof (ulong_t);
	ulong_t rows = bufsize / row_size;
	ulong_t col, row;

	while (num_loops--) {
		for (col = 0; col < words_per_row; col++) {
			rwp = buf + col;
			for (row = 0; row < rows; row++) {
				/* load, scale, store back in place */
				*rwp *= write_val;
				rwp += words_per_row;
			}
		}
	}

	return (0);
}

/*
 * Write-miss-then-read loop: store a changing value, immediately read
 * it back into a volatile accumulator, striding one row per access.
 * The accumulator is stored into buf[0] afterwards so the reads are
 * not optimized away.  Always returns 0.
 */
static int
cache_writemiss_read(ulong_t *buf, size_t bufsize, size_t row_size,
    int num_loops, ulong_t write_val)
{
	ulong_t *wp;
	ulong_t words_per_row = row_size / sizeof (ulong_t);
	ulong_t rows = bufsize / row_size;
	ulong_t col, row;
	volatile ulong_t acc;

	acc = (ulong_t)lrand48();	/* Init acc for lint */

	while (num_loops--) {
		for (col = 0; col < words_per_row; col++) {
			wp = buf + col;
			for (row = 0; row < rows; row++) {
				*wp = write_val;
				/* Vary the data written each time */
				write_val *= 2;
				acc += *wp;
				wp += words_per_row;
			}
		}
	}
	*buf = acc;	/* Keep the reads observable */

	return (0);
}

/*
 * Trash every line of one cache set before moving to the next set.
 * 'operation' selects what is done to each line (write, read,
 * write-then-read, or read-modify-write).  Always returns 0.
 */
static int
cache_sets_trash(ulong_t *buf, size_t cache_size, size_t line_size,
    int set_associativity, int num_loops, ulong_t write_val,
    int operation)
{
	size_t nsets;
	size_t words_per_line;
	size_t set_stride;	/* distance to the next line of a set */
	ulong_t *set_base, *p;
	volatile ulong_t sum = 0;
	int way, count;
	size_t set;

	set_base = buf;
	nsets = cache_size / (line_size * set_associativity);
	words_per_line = line_size / (size_t)sizeof (ulong_t);
	/* Jump to the next line belonging to the same set */
	set_stride = (line_size * nsets) / (size_t)sizeof (ulong_t);

	/*
	 * The two inner loops trash every way of the current set,
	 * repeating the whole sweep num_loops times per set.
	 */
	for (set = 0; set < nsets; set++) {
		count = num_loops;
		while (count--) {
			p = set_base;
			for (way = 0; way < set_associativity; way++) {
				if (operation == WR_MISS) {
					*p = write_val;
					p += set_stride;
					/* Change write value */
					write_val *= 2;
				} else if (operation == RD_MISS) {
					sum += *p;
					p += set_stride;
				} else if (operation == WR_MISS_RD) {
					*p = write_val;
					/* Change write value */
					write_val *= 2;
					sum = *p;
					p += set_stride;
				} else if (operation == RD_MISS_WR) {
					(*p) *= write_val;
					p += set_stride;
				}
			}
		}
		set_base += words_per_line;
	}
	return (0);
}


/*
 * The main cache miss stress  function. This contains the following test loops
 * 1. Matrix with row size equal to line size
 *
 * The test loop is as below :
 *	ulong_t *ptr0 = buf of size  bufsize = 2 * cache_size
 *	num_clmn = row_size / sizeof(ulong)
 *	num_row = bufsize / rowsize
 *	for (j = 0; j < num_clmn; j++)
 *		ptr0 = buf + j;
 *		for (i = 0; i < num_row; i++)
 *			<OPERATE> *buf
 *			buf += num_clmn
 *
 * The inner loop jumps in terms of rows. This will cause an element of the
 * matrix with offset = row_size(either line size or cache_size), thus
 * causing a miss.
 *
 * The buffer size is set to 2 times cache size because :
 *	1. A end of inner loop the first part of the buffer(size = cachesize)
 *	   is out of the cache. The access to the start of the buffer will be
 *	   a miss.
 *	2. The same loop can be repeated N times causing same miss pattern. If
 *	   the buffer size is equal to cache size, the second iteration of the
 *	   loop will be a hit, as nothing else has replaced it(Assuming
 *	   standalone operation of the test loop.
 *	3. Assuming that the number of loops is a multiple of two, the same
 *	   buffer can be used to test different operations(write, read, etc.).
 *
 * Operations supported : Write miss, Read miss, Read miss followed by write &
 *	Write miss followed by read.
 */
static int
cache_miss_stress(size_t cache_size, size_t line_size,
	int set_associativity, int num_loops)
{
	ulong_t *buf;
	size_t bufsize = cache_size * 2;  /* 2x: see block comment above */
	int lock_status;
	ulong_t write_value;
	int rc = 0;
#ifdef L2CACHE_PERF_MONITOR
	/* Locals required by the L2CACHE_CPC_* macros */
	cpc_event_t before, after, diff;
#endif	/* L2CACHE_PERF_MONITOR */

	/* Alloc and lock the memory */
	buf = (ulong_t *)memalign((size_t)pagesize,
	    (size_t)bufsize);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, bufsize);
	if (lock_status) {
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "cache_miss mlock failed!!\n");
	}

	write_value = (ulong_t)lrand48();

	/*
	 * Line size miss
	 */
	L2CACHE_CPC_SETUP(l2wr_perf_string);
	if ((rc = cache_writemiss(buf, bufsize, line_size, num_loops,
	    write_value)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("Write miss loop : ");

	L2CACHE_CPC_SETUP(l2rd_perf_string);
	if ((rc = cache_readmiss(buf, bufsize, line_size, num_loops))
	    != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("Read miss loop : ");

	L2CACHE_CPC_SETUP(l2rd_perf_string);
	if ((rc = cache_readmiss_write(buf, bufsize, line_size, num_loops,
	    write_value)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("Read miss write loop : ");

	L2CACHE_CPC_SETUP(l2wr_perf_string);
	if ((rc = cache_writemiss_read(buf, bufsize, line_size, num_loops,
	    write_value)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("Write miss read loop : ");

	/*
	 * Trash all the sets in the cache.  For a direct-mapped cache
	 * (associativity <= 1) each line is its own set, which the
	 * loops above already cover -- fall through to the cleanup so
	 * the locked buffer is not leaked.
	 */
	if (set_associativity <= 1) {
		goto cache_miss_exit;
	}

	L2CACHE_CPC_SETUP(l2wr_perf_string);
	if ((rc = cache_sets_trash(buf, bufsize, line_size,
	    set_associativity, num_loops, write_value,
	    WR_MISS)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("sets write miss loop : ");

	L2CACHE_CPC_SETUP(l2rd_perf_string);
	if ((rc = cache_sets_trash(buf, bufsize, line_size,
	    set_associativity, num_loops, write_value,
	    RD_MISS)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("sets read miss loop : ");

	L2CACHE_CPC_SETUP(l2wr_perf_string);
	if ((rc = cache_sets_trash(buf, bufsize, line_size,
	    set_associativity, num_loops, write_value,
	    WR_MISS_RD)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("sets write miss read loop : ");

	L2CACHE_CPC_SETUP(l2rd_perf_string);
	if ((rc = cache_sets_trash(buf, bufsize, line_size,
	    set_associativity, num_loops, write_value,
	    RD_MISS_WR)) != 0) {
		goto cache_miss_exit;
	}
	L2CACHE_CPC_AFTER("sets read miss write loop : ");

cache_miss_exit:
	/* unlock (only if the earlier mlock succeeded) and free the memory */
	if (lock_status == 0) {
		if (munlock((void *)buf, bufsize) != 0) {
			/* Non-fatal: log and continue with the free */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cache_miss munlock failed!!\n");
		}
	}
	free(buf);

	return (rc);
}

/*
 * Random offset access test loops:
 *		These test loops exercise the cache in a random manner,
 *		causing cache hits and misses in an random manner.
 *
 *		Currently writeonly, readonly and wr-read-verify loops
 *		are supported. Write followed by Read verify cannot be
 *		tested as write at random locations does NOT guarantee
 *		that the complete buffer is written. The read-verify
 *		at a location which was not written WILL fail!!!
 *		Read verify can be done with fully written buffer.
 */
/*
 * Random-offset write loop: obtain a table of random word offsets
 * within the buffer and store a (changing) value at each of them.
 * Returns ENOMEM if the offset table cannot be allocated, else 0.
 */
static int
rand_off_wr_loop(ulong_t *buf, size_t bufsize, size_t numloops,
	ulong_t write_val)
{
	ulong_t *offsets;
	size_t	noffsets, idx;

	/* Offsets are limited to the range ]0 - (bufsize - 1)] */
	offsets = l2_get_randbuf(bufsize, &noffsets);
	if (offsets == NULL) {
		return (ENOMEM);
	}

	for (; numloops > 0; numloops--) {
		for (idx = 0; idx < noffsets; idx++) {
			buf[offsets[idx]] = write_val;
		}
		write_val *= 2;
	}
	free(offsets);
	return (0);
}

/*
 * Random-offset write-read-verify loop: store a value at each random
 * word offset and immediately read it back.
 *
 * Returns ENOMEM if the offset table cannot be allocated, -1 on a
 * miscompare, 0 on success.  The offset table is freed on every
 * return path.
 */
static int
rand_off_wrrdvrfy_loop(ulong_t *buf, size_t bufsize, int numloops,
	ulong_t write_val)
{
	ulong_t *rand_buf;
	size_t	rand_bufsz, i;

	/*
	 * Fill the random buffer with random offsets. The offsets
	 * are limited by ]0 - (bufsize -1)].
	 */
	rand_buf = l2_get_randbuf(bufsize, &rand_bufsz);
	if (rand_buf == NULL) {
		return (ENOMEM);
	}

	while (numloops--) {
		for (i = 0; i < rand_bufsz; i++) {
			buf[rand_buf[i]] = write_val;
			if (buf[rand_buf[i]] != write_val) {
				/* Free the offset table on failure too */
				free(rand_buf);
				return (-1);
			}
		}
		write_val *= 2;
	}
	free(rand_buf);
	return (0);
}

static int
rand_off_rd_loop(ulong_t *buf, size_t bufsize, int numloops)
{
	ulong_t	*offsets;
	size_t	num_offsets;
	size_t	idx;
	/* volatile sink keeps the reads from being optimized away */
	volatile ulong_t read_sink;

	/*
	 * Obtain a table of random offsets into buf. Each offset is
	 * limited by ]0 - (bufsize -1)].
	 */
	offsets = l2_get_randbuf(bufsize, &num_offsets);
	if (offsets == NULL) {
		return (ENOMEM);
	}
	read_sink = num_offsets;

	/* Read the buffer at the random offsets, numloops times over */
	while (numloops--) {
		for (idx = 0; idx < num_offsets; idx++) {
			read_sink += buf[offsets[idx]];
		}
	}

	free(offsets);
	return (0);
}

/*
 * Drive the three random-offset loops (write, write-read-verify,
 * read) over a buffer of bufsize bytes, numloops passes each.
 * Returns 0 on success, else the first failing loop's status.
 */
static int
cache_rand_off_stress(ulong_t *buf, size_t bufsize, int numloops)
{
	int status;
	ulong_t seed_val = (ulong_t)lrand48();

	/* Work in ulong_t units from here on */
	bufsize /= sizeof (ulong_t);

	status = rand_off_wr_loop(buf, bufsize, numloops, seed_val);
	if (status != 0) {
		return (status);
	}

	seed_val = (ulong_t)lrand48();
	status = rand_off_wrrdvrfy_loop(buf, bufsize, numloops, seed_val);
	if (status != 0) {
		return (status);
	}

	return (rand_off_rd_loop(buf, bufsize, numloops));
}

/*
 * March test to flush out the stuck at and coupling failures
 * in the Ecache.
 */
static	int
cache_march_stress1(size_t cache_size, size_t line_size, int num_loops)
{
	ulong_t	*buf;
	size_t	bufsize	= cache_size;	/* size_t (was int) : no truncation */
	int	ret = 0;
	int	lock_status = 0;

	ulong_t		pattern, cpattern;
	ulong_t 	*ptr0;
	ulong_t 	num_row, num_clmn; /* Num. of rows and columns */
	ulong_t 	i, j, patnum;   /* Loop variables */
	/*
	 * volatile so the read-only march steps (2 and 4) are not
	 * optimized away; consistent with cache_march_stress2().
	 */
	volatile ulong_t sum;
	int		arr_size;

	ulong_t	pats[] = {
		0xFFFFFFFFFFFFFFFFL, 0xAAAAAAAAAAAAAAAAL,
		0xfefefefefefefefeL, 0xfdfdfdfdfdfdfdfdL,
		0xfbfbfbfbfbfbfbfbL, 0xf7f7f7f7f7f7f7f7L,
		0xefefefefefefefefL, 0xdfdfdfdfdfdfdfdfL,
		0x7f7f7f7f7f7f7f7fL, 0xf0f0f0f0f0f0f0f0L,
		0x3333333333333333L, 0xccccccccccccccccL
	};

	arr_size = sizeof (pats) / sizeof (ulong_t);
	vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
	    "L2 March test with row striding\n");

	vts_message(0, VERBOSE, NULL, MARCH_TEST_ROW_STRIDE_START);

	/* Alloc and lock the memory */
	buf = (ulong_t *)memalign((size_t)pagesize, (size_t)bufsize);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, bufsize);
	num_clmn = line_size / sizeof (ulong_t);
	num_row = bufsize / line_size;

	while (num_loops--) {
		/* Pick one of the fixed patterns, or a random one */
		patnum = l2_getrandom(arr_size + 1);
		if (patnum < arr_size) {
			pattern = pats[patnum];
		} else {
			pattern = (ulong_t)lrand48();
		}
		cpattern = ~pattern;

		/* Doing March test with word size patterns */
		/* Step 1: Write to the entire E$ one word at a time */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				*ptr0 = pattern;
				ptr0 += num_clmn;
			}
		}

		/* Step 2: Read the entire E$ one word at a time */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				sum = *ptr0;
				ptr0 += num_clmn;
			}
		}

		/* Step 3: Write pattern complement to the entire E$ */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				*ptr0 = cpattern;
				ptr0 += num_clmn;
			}
		}

		/* Step 4: Read the entire E$ */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				sum = *ptr0;
				ptr0 += num_clmn;
			}
		}

		/* Step 5: Write the original pattern to the entire E$ */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				*ptr0 = pattern;
				ptr0 += num_clmn;
			}
		}

		/* Step 6: Read the entire E$ and compare */
		for (j = 0; j < num_clmn; j++) {
			ptr0 = buf + j;
			for (i = 0; i < num_row; i++) {
				sum = *ptr0;
				if (sum != pattern) {
					ret = -1;
					goto cache_march1_exit;
				}
				ptr0 += num_clmn;
			}
		}
	}

cache_march1_exit:
	/*
	 * Unlock only if the earlier mlock() succeeded (returned 0).
	 * The original tested "if (lock_status)", which unlocked
	 * exactly when the lock had FAILED.
	 */
	if (lock_status == 0) {
		lock_status = munlock((void *)buf, bufsize);
		if (lock_status) {
			/* Log an Fatal error? message */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cache_march munlock failed!!\n");
		}
	}
	vts_message(0, VERBOSE, NULL, MARCH_TEST_ROW_STRIDE_END);
	free(buf);
	return (ret);
}

/*
 * March test to flush out the stuck at and coupling failures
 * in the Ecache.
 */
static  int
cache_march_stress2(size_t cache_size, int num_loops)
{
	ulong_t *buf;
	size_t bufsize = cache_size;
	int ret = 0;
	int lock_status = 0;
	int	arr_size;

	ulong_t pattern, cpattern;
	ulong_t *ptr0;
	size_t num_words = cache_size / (size_t)sizeof (ulong_t);
	size_t j;
	volatile ulong_t sum, patnum; /* Loop variables */

	ulong_t pats[] = {
		0xFFFFFFFFFFFFFFFFL, 0xAAAAAAAAAAAAAAAAL,
		0xfefefefefefefefeL, 0xfdfdfdfdfdfdfdfdL,
		0xfbfbfbfbfbfbfbfbL, 0xf7f7f7f7f7f7f7f7L,
		0xefefefefefefefefL, 0xdfdfdfdfdfdfdfdfL,
		0x7f7f7f7f7f7f7f7fL, 0xf0f0f0f0f0f0f0f0L,
		0x3333333333333333L, 0xccccccccccccccccL
	};

	arr_size = sizeof (pats) / sizeof (ulong_t);
	vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
	    "L2 March test with column striding");

	vts_message(0, VERBOSE, NULL, MARCH_TEST_COLUMN_STRIDE_START);
	/* Alloc and lock the memory */
	buf = (ulong_t *)memalign((size_t)pagesize, (size_t)bufsize);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, bufsize);

	while (num_loops--) {
		patnum = l2_getrandom(arr_size + 1);
		if (patnum < arr_size) {
			pattern = pats[patnum];
			cpattern = ~pattern;
		} else {
			pattern = (ulong_t)lrand48();
			cpattern = ~pattern;
		}

		/* Doing March test with word size patterns */

		/* Step 1: Write to the entire E$ one word at a time */
		for (j = 0; j < num_words; j++) {
			ptr0 = buf;
			*ptr0 = pattern;
			ptr0++;
		}

		/* Step 2: Read the entire E$ one word at a time */
		for (j = 0; j < num_words; j++) {
			ptr0 = buf;
			sum = *ptr0;
			ptr0++;
		}

		/* Step 3: Write pattern compliment to the entire E$ */
		for (j = 0; j < num_words; j++) {
			ptr0 = buf;
			*ptr0 = cpattern;
			ptr0++;
		}

		/* Step 4: Read the entire E$ */
		for (j = 0; j < num_words; j++) {
			ptr0 = buf;
			sum = *ptr0;
			ptr0++;
		}

		/* Step 5: Write the original pattern to the entire E$ */
		for (j = 0; j < num_words; j++) {
			ptr0 = buf;
			*ptr0 = pattern;
			ptr0++;
		}

		/* Step 6: Read the entire E$ and compare */
		for (j = 0; j < num_words; j++) {
			ptr0 = buf;
			sum = *ptr0;
			if (sum != pattern) {
				ret = -1;
				goto cache_march2_exit;
			}
			ptr0++;
		}
	}

cache_march2_exit:
	/* unlock(if lock was successful) and free the memory */
	if (lock_status) {
		lock_status = munlock((void *)buf, bufsize);
		if (lock_status) {
			/* Log an Fatal error? message */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cache_miss munlock failed!!\n");
		}
	}
	vts_message(0, VERBOSE, NULL, MARCH_TEST_COLUMN_STRIDE_END);
	free(buf);
	return (ret);
}

/*
 * The Main cache stress function callable from the test main.
 *
 * Arguments : Number of loops
 * Returns : 0 on success, non-zero (errno value or -1) on failure
 */
int
l2cache_stress(int num_loops, cache_info_t *cache_info)
{
	int rval = 0;
	size_t cache_size, line_size;
	ulong_t	*buf;
	int lock_status;
	int set_associativity;

	pagesize = (size_t)sysconf(_SC_PAGESIZE);

	cache_size = cache_info->ecache_size;
	line_size = cache_info->ecache_line_size;
	set_associativity = cache_info->ecache_associativity;

#ifdef L2CACHE_PERF_MONITOR
	if (l2_cpc_valid(&l2cache_cpc_ver, L2_CACHE_TEST_NAME,
	    L2_LIBCPC_MSG) == 0) {
		l2cache_cpc_valid = 1;
	}

	/*
	 * Initialize the performance monitoring strings.
	 * US-III, US-III+ and US-IIIi all have the same
	 * performance counters.
	 *
	 * Use cpc_getcciname to get the printable description
	 * of processor  performance counter interfaces.
	 */
	if (l2cache_cpc_valid) {
		const char *cciname;

		cciname = cpc_getcciname(l2cache_cpc_ver);

		if (cciname && ((strncmp(cciname, "UltraSPARC III",
		    strlen("UltraSPARC III")) == 0) ||
		    (strncmp(cciname, "UltraSPARC III+ & IV",
		    strlen("UltraSPARC III+ & IV")) == 0))) {
			/* for US-III/US-III+/US-IV cpus */
			l2wr_perf_string = "pic0=EC_ref,pic1=EC_misses";
			l2rd_perf_string = "pic0=EC_ref,pic1=EC_misses";
			l2_pic0_name = "EC_ref";
			l2_pic1_name = "EC_misses";
		} else if (cciname) { /* UltraSParc I&II */
			l2wr_perf_string = "pic0=EC_ref,pic1=EC_hit";
			l2rd_perf_string = "pic0=EC_ref,pic1=EC_hit";
			l2_pic0_name = "EC_ref";
			l2_pic1_name = "EC_hit";
		} else {
			/* This should NOT happen */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cpc_getcciname failed!!\n");
			l2cache_cpc_valid = 0;
		}
	}
#endif

	/*
	 * %lx with explicit ulong_t casts : the original passed
	 * size_t arguments against "%x", which is undefined behavior
	 * in a printf-style varargs call on LP64.
	 */
	vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
	    "Cache Access stress for cache_size %lx line_size %lx"
	    " num_loops %d\n", (ulong_t)cache_size, (ulong_t)line_size,
	    num_loops);

	/* Cache hit stress */
	if ((rval = cache_hit_stress(cache_size, line_size, num_loops))
	    != 0) {
		return (rval);
	}

	/* Cache miss stress */
	if ((rval = cache_miss_stress(cache_size, line_size,
			set_associativity, num_loops))
	    != 0) {
		return (rval);
	}

	/* March testing to flush out stuck at and coupling faults */
	if ((rval = cache_march_stress1(cache_size, line_size, num_loops))
	    != 0) {
		return (rval);
	}

	/* March testing to flush out stuck at and coupling faults */
	if ((rval = cache_march_stress2(cache_size, num_loops))
	    != 0) {
		return (rval);
	}

	/*
	 * 17N March test writing to one complete cache line at a
	 * time. The test uses ldx/stx commands for writing and
	 * reading. There is no checking in the subtest. Depends on
	 * ECC checking to catch the failures.
	 */
	if ((rval = cache_march_stress3(cache_size, line_size, num_loops))
		!= 0) {
		return (rval);
	}

	/* Random swap testing */
	if ((rval = crand(cache_size)) != 0) {
		return (rval);
	}

	/*
	 * Random offset testing
	 */

	/* Alloc and lock the memory */
	buf = (ulong_t *)memalign((size_t)pagesize,
	    (size_t)cache_size);
	if (buf == NULL) {
		return (ENOMEM);
	}
	lock_status = mlock((void *)buf, cache_size);
	if (lock_status) {
		vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
		    "cache_miss mlock failed!!\n");
	}

	/*
	 * Run the loops and fall through to the common cleanup so
	 * buf is always unlocked and freed. The original returned
	 * directly on failure, leaking buf and leaving it locked.
	 */
	rval = cache_rand_off_stress(buf, cache_size, num_loops);

	/*
	 * Unlock only if mlock() succeeded (returned 0); the original
	 * tested "if (lock_status)", which unlocked exactly when the
	 * lock had FAILED.
	 */
	if (lock_status == 0) {
		lock_status = munlock((void *)buf, cache_size);
		if (lock_status) {
			/* Log an Fatal error? message */
			vts_message(NO_EXIT, DEBUG, L2_CACHE_TEST_NAME, 0,
			    "cache_miss munlock failed!!\n");
		}
	}
	free(buf);

	return (rval);
}
