Image-CCV
view release on metacpan or search on metacpan
ccv-src/lib/nnc/ccv_nnc_symbolic_graph_compile.c view on Meta::CPAN
int tensor_symbol_info_size;
int exec_symbol_info_size;
int tensor_block_size;
int sub_prep_size;
ccv_nnc_tensor_block_t* tensor_blocks;
ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
int* dup_tensor_block_ref;
ccv_nnc_graph_visit_t* visit;
ccv_nnc_tensor_alloc_prep_t* alloc_prep;
struct ccv_nnc_symbolic_graph_prep_s* p;
struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
// Structures that don't require to be freed after deallocation.
ccv_nnc_graph_t* graph; // materialized graph, not managed by prep.
ccv_nnc_tensor_arena_t* tensor_arena; // tensor arena, not managed by prep as well.
} ccv_nnc_symbolic_graph_prep_t;
static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
{
// Compute how many dis-continuous buffers are needed.
// We prefer to have several dis-continuous buffers instead of one big buffer because
// in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
// to fully utilize memory.
int i, j, k;
ccv_array_t** alloc_dep = (ccv_array_t**)cccalloc(tensor_block_size, sizeof(ccv_array_t*));
int allocable_tensor_size = 0, available_tensor_size = 0;
for (i = 0; i < tensor_block_size; i++)
if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
{
// Tensors that we need the header info.
++available_tensor_size;
if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
// Tensors that we actually need to allocate (exclude the alias).
++allocable_tensor_size;
}
ccv_sparse_matrix_t* tensor_itf = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_8U | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
// Overlap count.
for (i = 0; i < tensor_block_size; i++)
for (j = i + 1; j < tensor_block_size; j++)
if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
{
// If either of the tensor is const, it must interfere with each other.
const uint8_t one = 1;
if (TENSOR_EXPECT_CONST(tensor_blocks[i]) || TENSOR_EXPECT_CONST(tensor_blocks[j]))
ccv_set_sparse_matrix_cell(tensor_itf, i, j, &one);
else {
// Otherwise, check to see if they interfere (default to yes).
// If any of the i's head is deterministically later than j's tail
// or any of the i's tail is deterministically earlier than j's head, they don't interfere.
int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
int j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
// It cannot be that both i can hop to j can j can hop to i.
assert(!(i_hop_j > 0 && j_hop_i > 0));
if (!i_hop_j && !j_hop_i)
ccv_set_sparse_matrix_cell(tensor_itf, i, j, &one);
}
}
int* oc = (int*)cccalloc(tensor_block_size, sizeof(int));
for (i = 0; i < tensor_block_size; i++)
for (j = 0; j < tensor_block_size; j++)
// If these two tensors are still alive, analyze them.
if (i != j && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
{
ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(tensor_itf, ccv_min(i, j), ccv_max(i, j));
// If their life time overlaps, compute how many tensors it overlap.
if (cell.u8 && cell.u8[0] == 1)
++oc[i];
}
int* assigned = (int*)cccalloc(tensor_block_size, sizeof(int));
uint64_t* allocated_offset = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
uint64_t* allocated_size = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
int num_assigned = 0;
// I can do a bit optimization here to assign out const tensor first, but heck, this just works for now.
// Allocation graph (assuming there is a source node, and a destination node, which is 0, and (tensor_block_size + 1)
// The first channel denotes the bytes available for allocation,
// the second channel denotes the offset available for the allocation,
ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
for (j = 0; j < allocable_tensor_size;)
{
// Find the one with largest overlap, and it is not assigned.
int max_oc = 0;
ccv_array_clear(opt);
for (i = 0; i < tensor_block_size; i++)
if (oc[i] >= max_oc && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && !assigned[i] && IS_PRIMARY_COMPANION(i, tensor_blocks[i]))
{
ccv_nnc_tensor_opt_t a = {
.size = tensor_blocks[i].size,
.index = i,
.companion = -1, // If already have a designated companion, use that.
.oc = oc[i],
};
if (tensor_blocks[i].companion_ref)
{
const int companion_ref = tensor_blocks[i].companion_ref - 1;
a.size = ccv_max(a.size, tensor_blocks[companion_ref].size);
a.oc += oc[companion_ref];
}
// In case we have a tie, take them all in the array.
if (a.oc > max_oc)
ccv_array_clear(opt), max_oc = a.oc;
ccv_array_push(opt, &a);
}
assert(opt->rnum > 0);
// Go through opt array, find all tensors that doesn't interfere with it, and have tensor size larger than it.
// Push them with the "companion" into the opt array as well.
const int rnum = opt->rnum;
for (i = 0; i < rnum; i++)
{
// Copy it out, because after insertion, it may hold invalid pointer.
ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
assert(a.companion == -1);
const int companion_ref = tensor_blocks[i].companion_ref - 1;
for (k = 0; k < tensor_block_size; k++)
// Find non-overlapping tensor that has larger size (of course, is unassigned and is not one with designated companion).
if (k != a.index && !tensor_blocks[k].companion_ref && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[k]) && !assigned[k] && tensor_blocks[k].size > a.size)
{
ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(tensor_itf, ccv_min(a.index, k), ccv_max(a.index, k));
// Good, push to opt array.
if (cell.u8 && cell.u8[0] == 1)
continue;
( run in 1.279 second using v1.01-cache-2.11-cpan-5a3173703d6 )