Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/dmlc-core/src/io/single_file_split.h  view on Meta::CPAN

/*!
 *  Copyright (c) 2015 by Contributors
 * \file single_file_split.h
 * \brief base implementation of line-spliter
 * \author Tianqi Chen
 */
#ifndef DMLC_IO_SINGLE_FILE_SPLIT_H_
#define DMLC_IO_SINGLE_FILE_SPLIT_H_

#include <dmlc/io.h>
#include <dmlc/logging.h>
#include <sys/stat.h>
#include <cstdio>
#include <string>
#include <algorithm>

#if defined(__FreeBSD__)
#define fopen64 std::fopen
#endif

namespace dmlc {
namespace io {
/*!
 * \brief line split implementation from single FILE
 * simply returns lines of files, used for stdin
 */
class SingleFileSplit : public InputSplit {
 public:
  explicit SingleFileSplit(const char *fname)
      : use_stdin_(false), buffer_size_(kBufferSize),
        chunk_begin_(NULL), chunk_end_(NULL) {
    if (!std::strcmp(fname, "stdin")) {
#ifndef DMLC_STRICT_CXX98_
      use_stdin_ = true; fp_ = stdin;
#endif
    }
    if (!use_stdin_) {
      fp_ = fopen64(fname, "rb");
      CHECK(fp_ != NULL) << "SingleFileSplit: fail to open " << fname;
    }
    buffer_.resize(kBufferSize);
  }
  virtual ~SingleFileSplit(void) {
    if (!use_stdin_) std::fclose(fp_);
  }
  virtual void BeforeFirst(void) {
    fseek(fp_, 0, SEEK_SET);
  }
  virtual void HintChunkSize(size_t chunk_size) {
    buffer_size_ = std::max(chunk_size, buffer_size_);
  }
  virtual size_t GetTotalSize(void) {
    struct stat buf;
    fstat(fileno(fp_), &buf);
    return buf.st_size;
  }
  virtual size_t Read(void *ptr, size_t size) {
    return std::fread(ptr, 1, size, fp_);
  }
  virtual void ResetPartition(unsigned part_index, unsigned num_parts) {
    CHECK(part_index == 0 && num_parts == 1);
    this->BeforeFirst();
  }
  virtual void Write(const void *ptr, size_t size) {
    LOG(FATAL) << "InputSplit do not support write";
  }
  virtual bool NextRecord(Blob *out_rec) {
    if (chunk_begin_ == chunk_end_) {
      if (!LoadChunk()) return false;
    }
    char *next = FindNextRecord(chunk_begin_,
                                chunk_end_);
    out_rec->dptr = chunk_begin_;
    out_rec->size = next - chunk_begin_;
    chunk_begin_ = next;
    return true;
  }
  virtual bool NextChunk(Blob *out_chunk) {
    if (chunk_begin_ == chunk_end_) {
      if (!LoadChunk()) return false;
    }
    out_chunk->dptr = chunk_begin_;
    out_chunk->size = chunk_end_ - chunk_begin_;
    chunk_begin_ = chunk_end_;
    return true;
  }
  inline bool ReadChunk(void *buf, size_t *size) {
    size_t max_size = *size;
    if (max_size <= overflow_.length()) {
      *size = 0; return true;
    }
    if (overflow_.length() != 0) {
      std::memcpy(buf, BeginPtr(overflow_), overflow_.length());
    }
    size_t olen = overflow_.length();
    overflow_.resize(0);
    size_t nread = this->Read(reinterpret_cast<char*>(buf) + olen,
                              max_size - olen);
    nread += olen;
    if (nread == 0) return false;
    if (nread != max_size) {
      *size = nread;
      return true;
    } else {
      const char *bptr = reinterpret_cast<const char*>(buf);
      // return the last position where a record starts
      const char *bend = this->FindLastRecordBegin(bptr, bptr + max_size);
      *size = bend - bptr;
      overflow_.resize(max_size - *size);
      if (overflow_.length() != 0) {
        std::memcpy(BeginPtr(overflow_), bend, overflow_.length());
      }
      return true;
    }



( run in 0.597 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )