Image-CCV

 view release on metacpan or  search on metacpan

ccv-src/lib/nnc/ccv_nnc_symbolic_graph_compile.c  view on Meta::CPAN

	int tensor_symbol_info_size;
	int exec_symbol_info_size;
	int tensor_block_size;
	int sub_prep_size;
	ccv_nnc_tensor_block_t* tensor_blocks;
	ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
	ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
	int* dup_tensor_block_ref;
	ccv_nnc_graph_visit_t* visit;
	ccv_nnc_tensor_alloc_prep_t* alloc_prep;
	struct ccv_nnc_symbolic_graph_prep_s* p;
	struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
	// Structures that don't require to be freed after deallocation.
	ccv_nnc_graph_t* graph; // materialized graph, not managed by prep.
	ccv_nnc_tensor_arena_t* tensor_arena; // tensor arena, not managed by prep as well.
} ccv_nnc_symbolic_graph_prep_t;

static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
{
	// Compute how many dis-continuous buffers are needed.
	// We prefer to have several dis-continuous buffers instead of one big buffer because
	// in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
	// to fully utilize memory.
	int i, j, k;
	ccv_array_t** alloc_dep = (ccv_array_t**)cccalloc(tensor_block_size, sizeof(ccv_array_t*));
	int allocable_tensor_size = 0, available_tensor_size = 0;
	for (i = 0; i < tensor_block_size; i++)
		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
		{
			// Tensors that we need the header info.
			++available_tensor_size;
			if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
				// Tensors that we actually need to allocate (exclude the alias).
				++allocable_tensor_size;
		}
	ccv_sparse_matrix_t* tensor_itf = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_8U | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
	// Overlap count.
	for (i = 0; i < tensor_block_size; i++)
		for (j = i + 1; j < tensor_block_size; j++)
			if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
			{
				// If either of the tensor is const, it must interfere with each other.
				const uint8_t one = 1;
				if (TENSOR_EXPECT_CONST(tensor_blocks[i]) || TENSOR_EXPECT_CONST(tensor_blocks[j]))
					ccv_set_sparse_matrix_cell(tensor_itf, i, j, &one);
				else {
					// Otherwise, check to see if they interfere (default to yes).
					// If any of the i's head is deterministically later than j's tail
					// or any of the i's tail is deterministically earlier than j's head, they don't interfere.
					int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
					int j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
					// It cannot be that both i can hop to j can j can hop to i.
					assert(!(i_hop_j > 0 && j_hop_i > 0));
					if (!i_hop_j && !j_hop_i)
						ccv_set_sparse_matrix_cell(tensor_itf, i, j, &one);
				}
			}
	int* oc = (int*)cccalloc(tensor_block_size, sizeof(int));
	for (i = 0; i < tensor_block_size; i++)
		for (j = 0; j < tensor_block_size; j++)
			// If these two tensors are still alive, analyze them.
			if (i != j && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
			{
				ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(tensor_itf, ccv_min(i, j), ccv_max(i, j));
				// If their life time overlaps, compute how many tensors it overlap.
				if (cell.u8 && cell.u8[0] == 1)
					++oc[i];
			}
	int* assigned = (int*)cccalloc(tensor_block_size, sizeof(int));
	uint64_t* allocated_offset = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
	uint64_t* allocated_size = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
	int num_assigned = 0; 
	// I can do a bit optimization here to assign out const tensor first, but heck, this just works for now.
	// Allocation graph (assuming there is a source node, and a destination node, which is 0, and (tensor_block_size + 1)
	// The first channel denotes the bytes available for allocation,
	// the second channel denotes the offset available for the allocation,
	ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
	ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
	for (j = 0; j < allocable_tensor_size;)
	{
		// Find the one with largest overlap, and it is not assigned.
		int max_oc = 0;
		ccv_array_clear(opt);
		for (i = 0; i < tensor_block_size; i++)
			if (oc[i] >= max_oc && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && !assigned[i] && IS_PRIMARY_COMPANION(i, tensor_blocks[i]))
			{
				ccv_nnc_tensor_opt_t a = {
					.size = tensor_blocks[i].size,
					.index = i,
					.companion = -1, // If already have a designated companion, use that.
					.oc = oc[i],
				};
				if (tensor_blocks[i].companion_ref)
				{
					const int companion_ref = tensor_blocks[i].companion_ref - 1;
					a.size = ccv_max(a.size, tensor_blocks[companion_ref].size);
					a.oc += oc[companion_ref];
				}
				// In case we have a tie, take them all in the array.
				if (a.oc > max_oc)
					ccv_array_clear(opt), max_oc = a.oc;
				ccv_array_push(opt, &a);
			}
		assert(opt->rnum > 0);
		// Go through opt array, find all tensors that doesn't interfere with it, and have tensor size larger than it.
		// Push them with the "companion" into the opt array as well.
		const int rnum = opt->rnum;
		for (i = 0; i < rnum; i++)
		{
			// Copy it out, because after insertion, it may hold invalid pointer.
			ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
			assert(a.companion == -1);
			const int companion_ref = tensor_blocks[i].companion_ref - 1;
			for (k = 0; k < tensor_block_size; k++)
				// Find non-overlapping tensor that has larger size (of course, is unassigned and is not one with designated companion).
				if (k != a.index && !tensor_blocks[k].companion_ref && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[k]) && !assigned[k] && tensor_blocks[k].size > a.size)
				{
					ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(tensor_itf, ccv_min(a.index, k), ccv_max(a.index, k));
					// Good, push to opt array.
					if (cell.u8 && cell.u8[0] == 1)
						continue;



( run in 1.279 second using v1.01-cache-2.11-cpan-5a3173703d6 )