Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/dmlc-core/src/io/input_split_base.cc view on Meta::CPAN
file_ptr_ = std::upper_bound(file_offset_.begin(),
file_offset_.end(),
offset_begin_) - file_offset_.begin() - 1;
file_ptr_end_ = std::upper_bound(file_offset_.begin(),
file_offset_.end(),
offset_end_) - file_offset_.begin() - 1;
if (fs_ != NULL) {
delete fs_; fs_ = NULL;
}
// find the exact ending position
if (offset_end_ != file_offset_[file_ptr_end_]) {
CHECK(offset_end_ >file_offset_[file_ptr_end_]);
CHECK(file_ptr_end_ < files_.size());
fs_ = filesys_->OpenForRead(files_[file_ptr_end_].path);
fs_->Seek(offset_end_ - file_offset_[file_ptr_end_]);
offset_end_ += SeekRecordBegin(fs_);
delete fs_;
}
fs_ = filesys_->OpenForRead(files_[file_ptr_].path);
if (offset_begin_ != file_offset_[file_ptr_]) {
fs_->Seek(offset_begin_ - file_offset_[file_ptr_]);
offset_begin_ += SeekRecordBegin(fs_);
}
this->BeforeFirst();
}
void InputSplitBase::BeforeFirst(void) {
if (offset_begin_ >= offset_end_) return;
size_t fp = std::upper_bound(file_offset_.begin(),
file_offset_.end(),
offset_begin_) - file_offset_.begin() - 1;
if (file_ptr_ != fp) {
delete fs_;
file_ptr_ = fp;
fs_ = filesys_->OpenForRead(files_[file_ptr_].path);
}
// seek to beginning of stream
fs_->Seek(offset_begin_ - file_offset_[file_ptr_]);
offset_curr_ = offset_begin_;
tmp_chunk_.begin = tmp_chunk_.end = NULL;
// clear overflow buffer
overflow_.clear();
}
InputSplitBase::~InputSplitBase(void) {
delete fs_;
// no need to delete filesystem, it was singleton
}
std::string InputSplitBase::StripEnd(std::string str, char ch) {
while (str.length() != 0 && str[str.length() - 1] == ch) {
str.resize(str.length() - 1);
}
return str;
}
void InputSplitBase::InitInputFileInfo(const std::string& uri) {
// split by :
const char dlm = ';';
std::vector<std::string> file_list = Split(uri, dlm);
std::vector<URI> expanded_list;
// expand by match regex pattern.
for (size_t i = 0; i < file_list.size(); ++i) {
URI path(file_list[i].c_str());
size_t pos = path.name.rfind('/');
if (pos == std::string::npos || pos + 1 == path.name.length()) {
expanded_list.push_back(path);
} else {
URI dir = path;
dir.name = path.name.substr(0, pos);
std::vector<FileInfo> dfiles;
filesys_->ListDirectory(dir, &dfiles);
bool exact_match = false;
for (size_t i = 0; i < dfiles.size(); ++i) {
if (StripEnd(dfiles[i].path.name, '/') == StripEnd(path.name, '/')) {
expanded_list.push_back(dfiles[i].path);
exact_match = true;
break;
}
}
#if DMLC_USE_REGEX
if (!exact_match) {
std::string spattern = path.name;
try {
std::regex pattern(spattern);
for (size_t i = 0; i < dfiles.size(); ++i) {
if (dfiles[i].type != kFile || dfiles[i].size == 0) continue;
std::string stripped = StripEnd(dfiles[i].path.name, '/');
std::smatch base_match;
if (std::regex_match(stripped, base_match, pattern)) {
for (size_t j = 0; j < base_match.size(); ++j) {
if (base_match[j].str() == stripped) {
expanded_list.push_back(dfiles[i].path); break;
}
}
}
}
} catch (std::regex_error& e) {
LOG(FATAL) << e.what() << " bad regex " << spattern
<< "This could due to compiler version, g++-4.9 is needed";
}
}
#endif // DMLC_USE_REGEX
}
}
for (size_t i = 0; i < expanded_list.size(); ++i) {
const URI& path = expanded_list[i];
FileInfo info = filesys_->GetPathInfo(path);
if (info.type == kDirectory) {
std::vector<FileInfo> dfiles;
filesys_->ListDirectory(info.path, &dfiles);
for (size_t i = 0; i < dfiles.size(); ++i) {
if (dfiles[i].size != 0 && dfiles[i].type == kFile) {
files_.push_back(dfiles[i]);
}
}
} else {
if (info.size != 0) {
files_.push_back(info);
}
}
}
CHECK_NE(files_.size(), 0U)
<< "Cannot find any files that matches the URI patternz " << uri;
}
size_t InputSplitBase::Read(void *ptr, size_t size) {
if (offset_begin_ >= offset_end_) return 0;
if (offset_curr_ + size > offset_end_) {
size = offset_end_ - offset_curr_;
}
if (size == 0) return 0;
size_t nleft = size;
char *buf = reinterpret_cast<char*>(ptr);
while (true) {
size_t n = fs_->Read(buf, nleft);
nleft -= n; buf += n;
offset_curr_ += n;
if (nleft == 0) break;
if (n == 0) {
if (offset_curr_ != file_offset_[file_ptr_ + 1]) {
LOG(ERROR) << "curr=" << offset_curr_
<< ",begin=" << offset_begin_
<< ",end=" << offset_end_
<< ",fileptr=" << file_ptr_
<< ",fileoffset=" << file_offset_[file_ptr_ + 1];
for (size_t i = 0; i < file_ptr_; ++i) {
LOG(ERROR) << "offset[" << i << "]=" << file_offset_[i];
}
LOG(FATAL) << "file offset not calculated correctly";
}
if (file_ptr_ + 1 >= files_.size()) break;
file_ptr_ += 1;
delete fs_;
fs_ = filesys_->OpenForRead(files_[file_ptr_].path);
}
}
return size - nleft;
}
bool InputSplitBase::ReadChunk(void *buf, size_t *size) {
size_t max_size = *size;
if (max_size <= overflow_.length()) {
*size = 0; return true;
}
if (overflow_.length() != 0) {
std::memcpy(buf, BeginPtr(overflow_), overflow_.length());
( run in 1.358 second using v1.01-cache-2.11-cpan-5b529ec07f3 )