process results from the CPAN

process

Algorithm-RectanglesContainingDot_XS

view release on metacpan or search on metacpan

 *  1                                           12 34 5
 *  2                1234 5
 *  3                                            12345
 *  4                 12345
 *
 * Adjacent pairs are merged in "grand sweeps" through the input.
 * This means, on pass 1, the records in runs 1 and 2 aren't revisited until
 * runs 3 and 4 are merged and the runs from run 5 have been copied.
 * The only cache that matters is one large enough to hold *all* the input.
 * On some platforms, this may be many times slower than smaller caches.
 *
 * The following pseudo-code uses the same basic merge algorithm,
 * but in a divide-and-conquer way.
 *
 * # merge $runs runs at offset $offset of list $list1 into $list2.
 * # all unmerged runs ($runs == 1) originate in list $base.
 * sub mgsort2 {
 *     my ($offset, $runs, $base, $list1, $list2) = @_;
 *
 *     if ($runs == 1) {
 *         if ($list1 is $base) copy run to $list2
 *         return offset of end of list (or copy)
 *     } else {
 *         $off2 = mgsort2($offset, $runs-($runs/2), $base, $list2, $list1)
 *         mgsort2($off2, $runs/2, $base, $list2, $list1)
 *         merge the adjacent runs at $offset of $list1 into $list2
 *         return the offset of the end of the merged runs
 *     }
 * }
 * mgsort2(0, $runs, $base, $aux, $base);
 *
 * For our 5 runs, the tree of calls looks like 
 *
 *           5
 *      3        2
 *   2     1   1   1
 * 1   1
 *
 * 1   2   3   4   5
 *
 * and the corresponding activity looks like
 *
 * copy runs 1 and 2 from base to aux
 * merge runs 1 and 2 from aux to base
 * (run 3 is where it belongs, no copy needed)
 * merge runs 12 and 3 from base to aux
 * (runs 4 and 5 are where they belong, no copy needed)
 * merge runs 4 and 5 from base to aux
 * merge runs 123 and 45 from aux to base
 *
 * Note that we merge runs 1 and 2 immediately after copying them,
 * while they are still likely to be in fast cache.  Similarly,
 * run 3 is merged with run 12 while it still may be lingering in cache.
 * This implementation should therefore enjoy much of the cache-friendly
 * behavior that quicksort does.  In addition, it does less copying
 * than the original mergesort implementation (only runs 1 and 2 are copied)
 * and the "balancing" of merges is better (merged runs comprise more nearly
 * equal numbers of original runs).
 *
 * The actual cache-friendly implementation will use a pseudo-stack
 * to avoid recursion, and will unroll processing of runs of length 2,
 * but it is otherwise similar to the recursive implementation.
 */

typedef struct {
    IV	offset;		/* offset of 1st of 2 runs at this level */
    IV	runs;		/* how many runs must be combined into 1 */
} off_runs;		/* pseudo-stack element */

static void 
sortsv(pTHX_ SV **base, size_t nmemb, SVCOMPARE_t cmp)
{
    IV i, run, runs, offset;
    I32 sense, level;
    int iwhich;
    register SV **f1, **f2, **t, **b, **p, **tp2, **l1, **l2, **q;
    SV **aux, **list1, **list2;
    SV **p1;
    SV * small[SMALLSORT];
    SV **which[3];
    off_runs stack[60], *stackp;
    SVCOMPARE_t savecmp = 0;

    if (nmemb <= 1) return;			/* sorted trivially */

    if (nmemb <= SMALLSORT) aux = small;	/* use stack for aux array */
    else { New(799,aux,nmemb,SV *); }		/* allocate auxilliary array */
    level = 0;
    stackp = stack;
    stackp->runs = dynprep(aTHX_ base, aux, nmemb, cmp);
    stackp->offset = offset = 0;
    which[0] = which[2] = base;
    which[1] = aux;
    for (;;) {
	/* On levels where both runs have be constructed (stackp->runs == 0),
	 * merge them, and note the offset of their end, in case the offset
	 * is needed at the next level up.  Hop up a level, and,
	 * as long as stackp->runs is 0, keep merging.
	 */
	if ((runs = stackp->runs) == 0) {
	    iwhich = level & 1;
	    list1 = which[iwhich];		/* area where runs are now */
	    list2 = which[++iwhich];		/* area for merged runs */
	    do {
		offset = stackp->offset;
		f1 = p1 = list1 + offset;		/* start of first run */
		p = tp2 = list2 + offset;	/* where merged run will go */
		t = NEXT(p);			/* where first run ends */
		f2 = l1 = POTHER(t, list2, list1); /* ... on the other side */
		t = NEXT(t);			/* where second runs ends */
		l2 = POTHER(t, list2, list1);	/* ... on the other side */
		offset = PNELEM(list2, t);
		while (f1 < l1 && f2 < l2) {
		    /* If head 1 is larger than head 2, find ALL the elements
		    ** in list 2 strictly less than head1, write them all,
		    ** then head 1.  Then compare the new heads, and repeat,
		    ** until one or both lists are exhausted.
		    **
		    ** In all comparisons (after establishing
		    ** which head to merge) the item to merge
		    ** (at pointer q) is the first operand of

sort.h view on Meta::CPAN

		    for (i = 1, run = 0 ;;) {
			if ((p = PINDEX(b, i)) >= t) {
			    /* off the end */
			    if (((p = PINDEX(t, -1)) > b) &&
				(cmp(aTHX_ *q, *p) <= sense))
				 t = p;
			    else b = p;
			    break;
			} else if (cmp(aTHX_ *q, *p) <= sense) {
			    t = p;
			    break;
			} else b = p;
			if (++run >= RTHRESH) i += i;
		    }


		    /* q is known to follow b and must be inserted before t.
		    ** Increment b, so the range of possibilities is [b,t).
		    ** Round binary split down, to favor early appearance.
		    ** Adjust b and t until q belongs just before t.
		    */

		    b++;
		    while (b < t) {
			p = PINDEX(b, (PNELEM(b, t) - 1) / 2);
			if (cmp(aTHX_ *q, *p) <= sense) {
			    t = p;
			} else b = p + 1;
		    }


		    /* Copy all the strictly low elements */

		    if (q == f1) {
			FROMTOUPTO(f2, tp2, t);
			*tp2++ = *f1++;
		    } else {
			FROMTOUPTO(f1, tp2, t);
			*tp2++ = *f2++;
		    }
		}


		/* Run out remaining list */
		if (f1 == l1) {
		       if (f2 < l2) FROMTOUPTO(f2, tp2, l2);
		} else              FROMTOUPTO(f1, tp2, l1);
		p1 = NEXT(p1) = POTHER(tp2, list2, list1);

		if (--level == 0) goto done;
		--stackp;
		t = list1; list1 = list2; list2 = t;	/* swap lists */
	    } while ((runs = stackp->runs) == 0);
	}


	stackp->runs = 0;		/* current run will finish level */
	/* While there are more than 2 runs remaining,
	 * turn them into exactly 2 runs (at the "other" level),
	 * each made up of approximately half the runs.
	 * Stack the second half for later processing,
	 * and set about producing the first half now.
	 */
	while (runs > 2) {
	    ++level;
	    ++stackp;
	    stackp->offset = offset;
	    runs -= stackp->runs = runs / 2;
	}
	/* We must construct a single run from 1 or 2 runs.
	 * All the original runs are in which[0] == base.
	 * The run we construct must end up in which[level&1].
	 */
	iwhich = level & 1;
	if (runs == 1) {
	    /* Constructing a single run from a single run.
	     * If it's where it belongs already, there's nothing to do.
	     * Otherwise, copy it to where it belongs.
	     * A run of 1 is either a singleton at level 0,
	     * or the second half of a split 3.  In neither event
	     * is it necessary to set offset.  It will be set by the merge
	     * that immediately follows.
	     */
	    if (iwhich) {	/* Belongs in aux, currently in base */
		f1 = b = PINDEX(base, offset);	/* where list starts */
		f2 = PINDEX(aux, offset);	/* where list goes */
		t = NEXT(f2);			/* where list will end */
		offset = PNELEM(aux, t);	/* offset thereof */
		t = PINDEX(base, offset);	/* where it currently ends */
		FROMTOUPTO(f1, f2, t);		/* copy */
		NEXT(b) = t;			/* set up parallel pointer */
	    } else if (level == 0) goto done;	/* single run at level 0 */
	} else {
	    /* Constructing a single run from two runs.
	     * The merge code at the top will do that.
	     * We need only make sure the two runs are in the "other" array,
	     * so they'll end up in the correct array after the merge.
	     */
	    ++level;
	    ++stackp;
	    stackp->offset = offset;
	    stackp->runs = 0;	/* take care of both runs, trigger merge */
	    if (!iwhich) {	/* Merged runs belong in aux, copy 1st */
		f1 = b = PINDEX(base, offset);	/* where first run starts */
		f2 = PINDEX(aux, offset);	/* where it will be copied */
		t = NEXT(f2);			/* where first run will end */
		offset = PNELEM(aux, t);	/* offset thereof */
		p = PINDEX(base, offset);	/* end of first run */
		t = NEXT(t);			/* where second run will end */
		t = PINDEX(base, PNELEM(aux, t)); /* where it now ends */
		FROMTOUPTO(f1, f2, t);		/* copy both runs */
		NEXT(b) = p;			/* paralled pointer for 1st */
		NEXT(p) = t;			/* ... and for second */
	    }
	}
    }
done:
    if (aux != small) Safefree(aux);	/* free iff allocated */
    return;
}

( run in 2.972 seconds using v1.01-cache-2.11-cpan-600a1bdf6e4 )