Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/dmlc-core/src/io/input_split_base.h view on Meta::CPAN
/*!
* Copyright (c) 2015 by Contributors
* \file input_split_base.h
* \brief base class to construct input split from multiple files
* \author Tianqi Chen
*/
#ifndef DMLC_IO_INPUT_SPLIT_BASE_H_
#define DMLC_IO_INPUT_SPLIT_BASE_H_
#include <dmlc/io.h>
#include <cstdio>
#include <cstring>
#include <vector>
#include <string>
#include <algorithm>
#include "./filesys.h"
namespace dmlc {
namespace io {
/*! \brief class to construct input split from multiple files */
class InputSplitBase : public InputSplit {
public:
/*!
 * \brief buffer for one loaded chunk of data, plus the
 *  begin/end pointers used to move along its records
 */
struct Chunk {
  // pointer to the start of the not-yet-consumed data
  char *begin;
  // pointer to the end of the data
  char *end;
  // backing storage of the chunk
  std::vector<size_t> data;

  // allocate buffer_size + 1 elements; both pointers start out as NULL
  explicit Chunk(size_t buffer_size)
      : begin(NULL),
        end(NULL),
        data(buffer_size + 1) {
  }

  // load chunk from split
  bool Load(InputSplitBase *split, size_t buffer_size);
};
// Default buffer size: 2UL << 20UL == 2,097,152 ("16 MB").
// NOTE(review): the 16 MB figure presumably assumes the size is counted in
// size_t elements (Chunk::data is a std::vector<size_t>) with an 8-byte
// size_t -- confirm against the code that consumes this constant.
static const size_t kBufferSize = 2UL << 20UL;
// virtual destructor (declaration only)
virtual ~InputSplitBase(void);
// implement BeforeFirst (declaration only)
virtual void BeforeFirst(void);
// Raise buffer_size_ to chunk_size / sizeof(size_t) when that is larger
// than the current value; the buffer size is never reduced here.
virtual void HintChunkSize(size_t chunk_size) {
  const size_t requested = chunk_size / sizeof(size_t);
  if (buffer_size_ < requested) {
    buffer_size_ = requested;
  }
}
// The total size is the last entry of the per-file byte offsets.
virtual size_t GetTotalSize(void) {
  const size_t total_size = file_offset_.back();
  return total_size;
}
// implement next record:
// try to extract a record from the temporary chunk; whenever that fails,
// load a new chunk and retry, giving up once no more data can be loaded
virtual bool NextRecord(Blob *out_rec) {
  for (;;) {
    if (ExtractNextRecord(out_rec, &tmp_chunk_)) {
      return true;
    }
    if (!tmp_chunk_.Load(this, buffer_size_)) {
      return false;
    }
  }
}
// implement next chunk:
// try to extract a chunk from the temporary chunk; whenever that fails,
// load a new chunk and retry, giving up once no more data can be loaded
virtual bool NextChunk(Blob *out_chunk) {
  for (;;) {
    if (ExtractNextChunk(out_chunk, &tmp_chunk_)) {
      return true;
    }
    if (!tmp_chunk_.Load(this, buffer_size_)) {
      return false;
    }
  }
}
// implement ResetPartition (declaration only).
// NOTE(review): rank/nsplit are presumably the index of this split and the
// total number of splits -- confirm against the InputSplit interface.
virtual void ResetPartition(unsigned rank, unsigned nsplit);
/*!
* \brief read a chunk of data into buf
* the data can span multiple records,
* but cannot contain partial records
xgboost/dmlc-core/src/io/input_split_base.h view on Meta::CPAN
* \param nsplit number of splits
* \param align_bytes the head split must be a multiple of align_bytes;
* this also checks that the file sizes are multiples of align_bytes
*/
// NOTE(review): the doc comment above lists \param nsplit, but this
// signature only takes fs, uri and align_bytes -- reconcile the two.
void Init(FileSystem *fs,
const char *uri,
size_t align_bytes);
// to be implemented by child class
/*!
 * \brief seek to the beginning of the first record
 *  in current file pointer
 * \param fi the stream to seek within
 * \return how many bytes we read past
 */
virtual size_t SeekRecordBegin(Stream *fi) = 0;
/*!
 * \brief find the last occurrence of a record header;
 *  to be implemented by child class (pure virtual)
 * \param begin beginning of the buffer
 * \param end end of the buffer
 * \return the pointer between [begin, end] indicating the
 *  last record head
 */
virtual const char*
FindLastRecordBegin(const char *begin, const char *end) = 0;
private:
/*! \brief FileSystem */
FileSystem *filesys_;
/*! \brief information about files */
std::vector<FileInfo> files_;
/*! \brief current input stream */
SeekStream *fs_;
/*! \brief bytes to be aligned */
size_t align_bytes_;
/*! \brief file pointer: which file to read from */
size_t file_ptr_;
/*! \brief file pointer where the end of file lies */
size_t file_ptr_end_;
/*! \brief the current offset */
size_t offset_curr_;
/*! \brief beginning of offset */
size_t offset_begin_;
/*! \brief end of the offset */
size_t offset_end_;
/*! \brief temporary chunk, used by NextRecord and NextChunk */
Chunk tmp_chunk_;
/*! \brief buffer size; HintChunkSize compares it against chunk_size / sizeof(size_t) */
size_t buffer_size_;
/*! \brief byte-offset of each file; GetTotalSize returns the last entry */
std::vector<size_t> file_offset_;
/*! \brief internal overflow buffer */
std::string overflow_;
/*! \brief initialize information in files */
void InitInputFileInfo(const std::string& uri);
/*! \brief strip continuous chars at the end of str */
std::string StripEnd(std::string str, char ch);
/*! \brief same as stream.Read */
size_t Read(void *ptr, size_t size);
};
} // namespace io
} // namespace dmlc
#endif // DMLC_IO_INPUT_SPLIT_BASE_H_
( run in 0.895 second using v1.01-cache-2.11-cpan-39bf76dae61 )