-
Notifications
You must be signed in to change notification settings - Fork 506
Closed
Milestone
Description
Symptom when reading the ORC file
> orc-content bad_present.orc
Caught exception in bad_present.orc: bad read in RleDecoderV2::readByte
How to generate bad_present.orc
The bad_present.orc file can be generated with the code below:
#include "ByteRLE.hh"
#include "Compression.hh"
#include "RLE.hh"
#include "ToolsHelper.hh"
#include "orc_proto.pb.h"
#include <iostream>
#include <memory>
#include <string>
// Buffer size of 100 MiB; not referenced by the reproducer code below.
constexpr int DEFAULT_MEM_STREAM_SIZE = 100 * 1024 * 1024;
std::unique_ptr<orc::Writer> createWriter(uint64_t stripeSize, uint64_t compresionblockSize,
orc::CompressionKind compression, const orc::Type& type,
orc::MemoryPool* memoryPool, orc::OutputStream* stream,
orc::FileVersion version, uint64_t stride = 0,
const std::string& timezone = "GMT",
bool useTightNumericVector = false) {
orc::WriterOptions options;
options.setStripeSize(stripeSize);
options.setCompressionBlockSize(compresionblockSize);
options.setCompression(compression);
options.setMemoryPool(memoryPool);
options.setRowIndexStride(stride);
options.setFileVersion(version);
options.setTimezoneName(timezone);
options.setUseTightNumericVector(useTightNumericVector);
return orc::createWriter(type, stream, options);
}
/// Writes "bad_present.orc", a ZLIB-compressed ORC file with the schema
/// struct<col1:int>.
///
/// The file holds 1000*1024 rows written in batches of 5. The first 900*1024
/// rows carry the non-null values 0..4 (repeating per batch); the remaining
/// 100*1024 rows are all null. According to the accompanying issue text, this
/// layout is what triggers the reader failure being reported.
///
/// @return 0 on success, 1 if the row batch does not have the expected
///         struct-of-long layout.
int main(int /*argc*/, char* /*argv*/[]) {
  orc::MemoryPool* pool = orc::getDefaultPool();
  std::unique_ptr<orc::Type> type(orc::Type::buildTypeFromString("struct<col1:int>"));
  const uint64_t stripeSize = 16 * 1024;       // 16K
  const uint64_t compressionBlockSize = 1024;  // 1k
  std::unique_ptr<orc::OutputStream> outStream = orc::writeLocalFile("bad_present.orc");
  std::unique_ptr<orc::Writer> writer =
      createWriter(stripeSize, compressionBlockSize, orc::CompressionKind_ZLIB, *type, pool,
                   outStream.get(), orc::FileVersion::v_0_12());

  const uint64_t batchSize = 5;
  const uint64_t batchCount = 1000 * 1024 / batchSize;
  // Index of the first batch whose rows are written as null.
  const uint64_t firstNullBatch = 900 * 1024 / batchSize;
  std::unique_ptr<orc::ColumnVectorBatch> rowBatch = writer->createRowBatch(batchSize);

  // The same batch object is reused for every add(), so the downcasts are
  // resolved once up front and validated before any dereference.
  auto* structBatch = dynamic_cast<orc::StructVectorBatch*>(rowBatch.get());
  if (structBatch == nullptr || structBatch->fields.empty()) {
    std::cerr << "row batch is not a struct batch with at least one field\n";
    return 1;
  }
  auto* longBatch = dynamic_cast<orc::LongVectorBatch*>(structBatch->fields[0]);
  if (longBatch == nullptr) {
    std::cerr << "first field of the row batch is not a long batch\n";
    return 1;
  }

  for (uint64_t batchIdx = 0; batchIdx < batchCount; ++batchIdx) {
    if (batchIdx >= firstNullBatch) {
      // Tail of the file: mark every row in the batch as null.
      for (uint64_t row = 0; row < batchSize; ++row) {
        longBatch->notNull[row] = 0;
      }
      longBatch->hasNulls = true;
    } else {
      // Head of the file: non-null rows holding their index within the batch.
      for (uint64_t row = 0; row < batchSize; ++row) {
        longBatch->data[row] = static_cast<int64_t>(row);
      }
    }
    structBatch->numElements = batchSize;
    longBatch->numElements = batchSize;
    writer->add(*rowBatch);
    rowBatch->clear();
  }
  writer->close();
  return 0;
}
Cause Reason
#1067 optimizes ORC files by skipping the write of the present stream when a column has no null values in the current stripe.
In that case ColumnWriter calls notNullEncoder->suppress instead of adding the present stream to the file streams.
However, notNullEncoder->suppress doesn't clear BooleanRleEncoderImpl::current and BooleanRleEncoderImpl::bitsRemained, so their leftover state may carry over into the next stripe. As a result, the next stripe's present stream contains 1-7 extra not-null values, which finally causes the "bad read in RleDecoderV2::readByte" error, because the length stream has fewer values than the present stream.
Metadata
Metadata
Assignees
Labels
No labels