-
Notifications
You must be signed in to change notification settings - Fork 8.3k
Add settings output_format_orc_dictionary_key_size_threshold to allow user to enable dict encoding for string column in ORC output format
#68591
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
dbd4ee4
enable dict encoding in orc writer
taiyang-li 03ab625
enable string dict encoding in orc output format
taiyang-li b0a0988
change as request
taiyang-li 1011f8e
add uts about orc string encode
taiyang-li d6df83d
add uts about orc string encode
taiyang-li 7aaa028
revert files
taiyang-li aa4688a
fix style
taiyang-li ae58212
change as request
taiyang-li 553c309
Merge branch 'master' into orc_dict_encode
taiyang-li 3d04f3d
Merge branch 'ClickHouse:master' into orc_dict_encode
taiyang-li 0de3b1d
Merge branch 'ClickHouse:master' into orc_dict_encode
taiyang-li 11c7cda
Merge branch 'ClickHouse:master' into orc_dict_encode
taiyang-li 4412946
Merge branch 'master' into orc_dict_encode
taiyang-li File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,311 @@ | ||
| #include <cstdlib> | ||
| #include <base/defines.h> | ||
| #include <benchmark/benchmark.h> | ||
|
|
||
| class OldSortedStringDictionary | ||
| { | ||
| public: | ||
| struct DictEntry | ||
| { | ||
| DictEntry(const char * str, size_t len) : data(str), length(len) { } | ||
| const char * data; | ||
| size_t length; | ||
| }; | ||
|
|
||
| OldSortedStringDictionary() : totalLength(0) { } | ||
|
|
||
| // insert a new string into dictionary, return its insertion order | ||
| size_t insert(const char * str, size_t len); | ||
|
|
||
| // reorder input index buffer from insertion order to dictionary order | ||
| void reorder(std::vector<int64_t> & idxBuffer) const; | ||
|
|
||
| // get dict entries in insertion order | ||
| void getEntriesInInsertionOrder(std::vector<const DictEntry *> &) const; | ||
|
|
||
| size_t size() const; | ||
|
|
||
| // return total length of strings in the dictionary | ||
| uint64_t length() const; | ||
|
|
||
| void clear(); | ||
|
|
||
| // store indexes of insertion order in the dictionary for not-null rows | ||
| std::vector<int64_t> idxInDictBuffer; | ||
|
|
||
| private: | ||
| struct LessThan | ||
| { | ||
| bool operator()(const DictEntry & left, const DictEntry & right) const | ||
| { | ||
| int ret = memcmp(left.data, right.data, std::min(left.length, right.length)); | ||
| if (ret != 0) | ||
| { | ||
| return ret < 0; | ||
| } | ||
| return left.length < right.length; | ||
| } | ||
| }; | ||
|
|
||
| std::map<DictEntry, size_t, LessThan> dict; | ||
| std::vector<std::vector<char>> data; | ||
| uint64_t totalLength; | ||
| }; | ||
|
|
||
| // insert a new string into dictionary, return its insertion order | ||
| size_t OldSortedStringDictionary::insert(const char * str, size_t len) | ||
| { | ||
| auto ret = dict.insert({DictEntry(str, len), dict.size()}); | ||
| if (ret.second) | ||
| { | ||
| // make a copy to internal storage | ||
| data.push_back(std::vector<char>(len)); | ||
| memcpy(data.back().data(), str, len); | ||
| // update dictionary entry to link pointer to internal storage | ||
| DictEntry * entry = const_cast<DictEntry *>(&(ret.first->first)); | ||
| entry->data = data.back().data(); | ||
| totalLength += len; | ||
| } | ||
| return ret.first->second; | ||
| } | ||
|
|
||
| /** | ||
| * Reorder input index buffer from insertion order to dictionary order | ||
| * | ||
| * We require this function because string values are buffered by indexes | ||
| * in their insertion order. Until the entire dictionary is complete can | ||
| * we get their sorted indexes in the dictionary in that ORC specification | ||
| * demands dictionary should be ordered. Therefore this function transforms | ||
| * the indexes from insertion order to dictionary value order for final | ||
| * output. | ||
| */ | ||
| void OldSortedStringDictionary::reorder(std::vector<int64_t> & idxBuffer) const | ||
| { | ||
| // iterate the dictionary to get mapping from insertion order to value order | ||
| std::vector<size_t> mapping(dict.size()); | ||
| size_t dictIdx = 0; | ||
| for (auto it = dict.cbegin(); it != dict.cend(); ++it) | ||
| { | ||
| mapping[it->second] = dictIdx++; | ||
| } | ||
|
|
||
| // do the transformation | ||
| for (size_t i = 0; i != idxBuffer.size(); ++i) | ||
| { | ||
| idxBuffer[i] = static_cast<int64_t>(mapping[static_cast<size_t>(idxBuffer[i])]); | ||
| } | ||
| } | ||
|
|
||
| // get dict entries in insertion order | ||
| void OldSortedStringDictionary::getEntriesInInsertionOrder(std::vector<const DictEntry *> & entries) const | ||
| { | ||
| entries.resize(dict.size()); | ||
| for (auto it = dict.cbegin(); it != dict.cend(); ++it) | ||
| { | ||
| entries[it->second] = &(it->first); | ||
| } | ||
| } | ||
|
|
||
| // return count of entries | ||
| size_t OldSortedStringDictionary::size() const | ||
| { | ||
| return dict.size(); | ||
| } | ||
|
|
||
| // return total length of strings in the dictionary | ||
| uint64_t OldSortedStringDictionary::length() const | ||
| { | ||
| return totalLength; | ||
| } | ||
|
|
||
| void OldSortedStringDictionary::clear() | ||
| { | ||
| totalLength = 0; | ||
| data.clear(); | ||
| dict.clear(); | ||
| } | ||
|
|
||
|
|
||
| /** | ||
| * Implementation of increasing sorted string dictionary | ||
| */ | ||
| class NewSortedStringDictionary | ||
| { | ||
| public: | ||
| struct DictEntry | ||
| { | ||
| DictEntry(const char * str, size_t len) : data(str), length(len) { } | ||
| const char * data; | ||
| size_t length; | ||
| }; | ||
|
|
||
| struct DictEntryWithIndex | ||
| { | ||
| DictEntryWithIndex(const char * str, size_t len, size_t index_) : entry(str, len), index(index_) { } | ||
| DictEntry entry; | ||
| size_t index; | ||
| }; | ||
|
|
||
| NewSortedStringDictionary() : totalLength_(0) { } | ||
|
|
||
| // insert a new string into dictionary, return its insertion order | ||
| size_t insert(const char * str, size_t len); | ||
|
|
||
| // reorder input index buffer from insertion order to dictionary order | ||
| void reorder(std::vector<int64_t> & idxBuffer) const; | ||
|
|
||
| // get dict entries in insertion order | ||
| void getEntriesInInsertionOrder(std::vector<const DictEntry *> &) const; | ||
|
|
||
| // return count of entries | ||
| size_t size() const; | ||
|
|
||
| // return total length of strings in the dictionary | ||
| uint64_t length() const; | ||
|
|
||
| void clear(); | ||
|
|
||
| // store indexes of insertion order in the dictionary for not-null rows | ||
| std::vector<int64_t> idxInDictBuffer; | ||
|
|
||
| private: | ||
| struct LessThan | ||
| { | ||
| bool operator()(const DictEntryWithIndex & l, const DictEntryWithIndex & r) | ||
| { | ||
| const auto & left = l.entry; | ||
| const auto & right = r.entry; | ||
| int ret = memcmp(left.data, right.data, std::min(left.length, right.length)); | ||
| if (ret != 0) | ||
| { | ||
| return ret < 0; | ||
| } | ||
| return left.length < right.length; | ||
| } | ||
| }; | ||
|
|
||
| mutable std::vector<DictEntryWithIndex> flatDict_; | ||
| std::unordered_map<std::string, size_t> keyToIndex; | ||
| uint64_t totalLength_; | ||
| }; | ||
|
|
||
| // insert a new string into dictionary, return its insertion order | ||
| size_t NewSortedStringDictionary::insert(const char * str, size_t len) | ||
| { | ||
| size_t index = flatDict_.size(); | ||
| auto ret = keyToIndex.emplace(std::string(str, len), index); | ||
| if (ret.second) | ||
| { | ||
| flatDict_.emplace_back(ret.first->first.data(), ret.first->first.size(), index); | ||
| totalLength_ += len; | ||
| } | ||
| return ret.first->second; | ||
| } | ||
|
|
||
| /** | ||
| * Reorder input index buffer from insertion order to dictionary order | ||
| * | ||
| * We require this function because string values are buffered by indexes | ||
| * in their insertion order. Until the entire dictionary is complete can | ||
| * we get their sorted indexes in the dictionary in that ORC specification | ||
| * demands dictionary should be ordered. Therefore this function transforms | ||
| * the indexes from insertion order to dictionary value order for final | ||
| * output. | ||
| */ | ||
| void NewSortedStringDictionary::reorder(std::vector<int64_t> & idxBuffer) const | ||
| { | ||
| // iterate the dictionary to get mapping from insertion order to value order | ||
| std::vector<size_t> mapping(flatDict_.size()); | ||
| for (size_t i = 0; i < flatDict_.size(); ++i) | ||
| { | ||
| mapping[flatDict_[i].index] = i; | ||
| } | ||
|
|
||
| // do the transformation | ||
| for (size_t i = 0; i != idxBuffer.size(); ++i) | ||
| { | ||
| idxBuffer[i] = static_cast<int64_t>(mapping[static_cast<size_t>(idxBuffer[i])]); | ||
| } | ||
| } | ||
|
|
||
| // get dict entries in insertion order | ||
| void NewSortedStringDictionary::getEntriesInInsertionOrder(std::vector<const DictEntry *> & entries) const | ||
| { | ||
| std::sort( | ||
| flatDict_.begin(), | ||
| flatDict_.end(), | ||
| [](const DictEntryWithIndex & left, const DictEntryWithIndex & right) { return left.index < right.index; }); | ||
|
|
||
| entries.resize(flatDict_.size()); | ||
| for (size_t i = 0; i < flatDict_.size(); ++i) | ||
| { | ||
| entries[i] = &(flatDict_[i].entry); | ||
| } | ||
| } | ||
|
|
||
| // return count of entries | ||
| size_t NewSortedStringDictionary::size() const | ||
| { | ||
| return flatDict_.size(); | ||
| } | ||
|
|
||
| // return total length of strings in the dictionary | ||
| uint64_t NewSortedStringDictionary::length() const | ||
| { | ||
| return totalLength_; | ||
| } | ||
|
|
||
| void NewSortedStringDictionary::clear() | ||
| { | ||
| totalLength_ = 0; | ||
| keyToIndex.clear(); | ||
| flatDict_.clear(); | ||
| } | ||
|
|
||
| template <size_t cardinality> | ||
| static std::vector<std::string> mockStrings() | ||
| { | ||
| std::vector<std::string> res(1000000); | ||
| for (auto & s : res) | ||
| { | ||
| s = "test string dictionary " + std::to_string(rand() % cardinality); | ||
| } | ||
| return res; | ||
| } | ||
|
|
||
| template <typename DictionaryImpl> | ||
| static NO_INLINE std::unique_ptr<DictionaryImpl> createAndWriteStringDictionary(const std::vector<std::string> & strs) | ||
| { | ||
| auto dict = std::make_unique<DictionaryImpl>(); | ||
| for (const auto & str : strs) | ||
| { | ||
| auto index = dict->insert(str.data(), str.size()); | ||
| dict->idxInDictBuffer.push_back(index); | ||
| } | ||
| dict->reorder(dict->idxInDictBuffer); | ||
|
|
||
| return dict; | ||
| } | ||
|
|
||
| template <typename DictionaryImpl, size_t cardinality> | ||
| static void BM_writeStringDictionary(benchmark::State & state) | ||
| { | ||
| auto strs = mockStrings<cardinality>(); | ||
| for (auto _ : state) | ||
| { | ||
| auto dict = createAndWriteStringDictionary<DictionaryImpl>(strs); | ||
| benchmark::DoNotOptimize(dict); | ||
| } | ||
| } | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_writeStringDictionary, OldSortedStringDictionary, 10); | ||
| BENCHMARK_TEMPLATE(BM_writeStringDictionary, NewSortedStringDictionary, 10); | ||
| BENCHMARK_TEMPLATE(BM_writeStringDictionary, OldSortedStringDictionary, 100); | ||
| BENCHMARK_TEMPLATE(BM_writeStringDictionary, NewSortedStringDictionary, 100); | ||
| BENCHMARK_TEMPLATE(BM_writeStringDictionary, OldSortedStringDictionary, 1000); | ||
| BENCHMARK_TEMPLATE(BM_writeStringDictionary, NewSortedStringDictionary, 1000); | ||
| BENCHMARK_TEMPLATE(BM_writeStringDictionary, OldSortedStringDictionary, 10000); | ||
| BENCHMARK_TEMPLATE(BM_writeStringDictionary, NewSortedStringDictionary, 10000); | ||
| BENCHMARK_TEMPLATE(BM_writeStringDictionary, OldSortedStringDictionary, 100000); | ||
| BENCHMARK_TEMPLATE(BM_writeStringDictionary, NewSortedStringDictionary, 100000); | ||
|
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.