Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/src/data/sparse_page_dmatrix.cc  view on Meta::CPAN

        const SparseBatch::Entry &e = prow.data[j];
        builder.Push(e.index,
                     SparseBatch::Entry(buffered_rowset_[i + begin], e.fvalue),
                     tid);
      }
    }
    CHECK_EQ(pcol->Size(), info.num_col);
    // sort columns
    bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
    #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
    for (bst_omp_uint i = 0; i < ncol; ++i) {
      if (pcol->offset[i] < pcol->offset[i + 1]) {
        std::sort(dmlc::BeginPtr(pcol->data) + pcol->offset[i],
                  dmlc::BeginPtr(pcol->data) + pcol->offset[i + 1],
                  SparseBatch::Entry::CmpValue);
      }
    }
  };

  // Build the next column page into *dptr from the rows served by iter.
  //
  // Rows are consumed from the current row batch, resuming at batch_ptr, and
  // buffered in tmp; the row index of every kept row is appended to
  // buffered_rowset_.  A row is kept unconditionally when pkeep == 1.0f,
  // otherwise only when coin_flip(rnd) says so.  Once tmp holds at least
  // max_row_perbatch rows or at least kPageSize bytes, the buffer is handed to
  // make_col_batch and the lambda returns, leaving batch_ptr on the first
  // unconsumed row so that the next call continues from there.
  //
  // Returns true if a page was handed to make_col_batch for *dptr, and false
  // if the iterator is exhausted and no rows were buffered.
  auto make_next_col = [&] (SparsePage* dptr) {
    tmp.Clear();
    // position in buffered_rowset_ of the first row belonging to this page
    size_t page_row_begin = buffered_rowset_.size();

    for (;;) {
      if (batch_ptr != batch_top) {
        const RowBatch& batch = iter->Value();
        CHECK_EQ(batch_top, batch.size);
        for (size_t row = batch_ptr; row < batch_top; ++row) {
          bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + row);
          const bool keep_row = pkeep == 1.0f || coin_flip(rnd);
          if (keep_row) {
            buffered_rowset_.push_back(ridx);
            tmp.Push(batch[row]);
          }

          const bool page_full = tmp.Size() >= max_row_perbatch ||
                                 tmp.MemCostBytes() >= kPageSize;
          if (!page_full) continue;

          // flush the buffered rows and remember where to resume next time
          make_col_batch(tmp, page_row_begin, dptr);
          batch_ptr = row + 1;
          return true;
        }
        // current row batch fully consumed
        batch_ptr = batch_top;
      }
      if (!iter->Next()) break;
      // start over at the beginning of the freshly loaded row batch
      batch_ptr = 0;
      batch_top = iter->Value().size;
    }

    // iterator exhausted: emit whatever is left, if anything
    if (tmp.Size() == 0) {
      return false;
    }
    make_col_batch(tmp, page_row_begin, dptr);
    return true;
  };

  // cache_info_ holds one cache prefix per shard, separated by ':'.
  std::vector<std::string> cache_shards = common::Split(cache_info_, ':');
  // Per shard: the column-page file name and the page format to write with.
  std::vector<std::string> name_shards;
  std::vector<std::string> format_shards;
  for (const std::string& shard_prefix : cache_shards) {
    name_shards.push_back(shard_prefix + ".col.page");
    // only the second component of DecideFormat's result is used here
    auto format_choice = SparsePage::Format::DecideFormat(shard_prefix);
    format_shards.push_back(std::move(format_choice.second));
  }

  // Write every column page produced by make_next_col through a
  // SparsePage::Writer over the shard files, accumulate per-column entry
  // counts in col_size_, then store buffered_rowset_ and col_size_ in
  // "<first shard prefix>.col.meta".
  {
    // NOTE(review): the meaning of the third constructor argument (6) is not
    // visible here - presumably a buffer/queue size; confirm against
    // SparsePage::Writer.
    SparsePage::Writer writer(name_shards, format_shards, 6);
    std::shared_ptr<SparsePage> page;
    // obtain a page from the writer and make sure it starts out empty
    writer.Alloc(&page); page->Clear();

    double tstart = dmlc::GetTime();
    size_t bytes_write = 0;
    // print progress every 4 sec.
    const double kStep = 4.0;
    // elapsed time at which the next progress line is due
    // (integral; initialised from kStep by implicit conversion)
    size_t tick_expected = kStep;

    while (make_next_col(page.get())) {
      // offset[i + 1] - offset[i] is the number of entries column i has in
      // this page; add it to the running per-column total
      for (size_t i = 0; i < page->Size(); ++i) {
        col_size_[i] += page->offset[i + 1] - page->offset[i];
      }

      bytes_write += page->MemCostBytes();
      // hand the filled page over to the writer, then fetch and clear a new
      // page for the next iteration (page is moved-from in between)
      writer.PushWrite(std::move(page));
      writer.Alloc(&page);
      page->Clear();

      double tdiff = dmlc::GetTime() - tstart;
      if (tdiff >= tick_expected) {
        // bytes_write >> 20 converts the byte count to whole MB
        // NOTE(review): "writen" in the message below is a typo for
        // "written"; left untouched so the emitted log text does not change.
        LOG(CONSOLE) << "Writing col.page file to " << cache_info_
                     << " in " << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
                     << (bytes_write >> 20UL) << " MB writen";
        tick_expected += kStep;
      }
    }
    // save meta data: the buffered row set followed by the column sizes,
    // next to the first cache shard
    std::string col_meta_name = cache_shards[0] + ".col.meta";
    std::unique_ptr<dmlc::Stream> fo(
        dmlc::Stream::Create(col_meta_name.c_str(), "w"));
    fo->Write(buffered_rowset_);
    fo->Write(col_size_);
    // destroy the stream explicitly instead of waiting for the end of scope
    // NOTE(review): presumably to make sure the file is flushed and closed
    // before the column data is initialised below - confirm.
    fo.reset(nullptr);
  }
  // initialize column data; the CHECK fails if TryInitColData() returns false
  CHECK(TryInitColData());
}

}  // namespace data
}  // namespace xgboost
#endif



( run in 0.414 second using v1.01-cache-2.11-cpan-39bf76dae61 )