diff --git a/.asf.yaml b/.asf.yaml
index 14178a61c8..3133513c49 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -24,12 +24,17 @@ github:
     merge: false
     squash: true
    rebase: true
+  ghp_branch: main
+  ghp_path: /site
   labels:
     - apache
     - orc
     - java
     - cpp
     - big-data
+  protected_tags:
+    - "rel/*"
+    - "v*.*.*"
 notifications:
   pullrequests: issues@orc.apache.org
   issues: issues@orc.apache.org
diff --git a/.clang-tidy b/.clang-tidy
index bd995bca54..b401f8948b 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -21,13 +21,14 @@ Checks: "-*,
 CheckOptions: [
+  { key: readability-identifier-naming.ParameterCase, value: "camelBack" },
+  { key: readability-identifier-naming.PrivateMemberCase, value: "camelBack"},
   { key: readability-identifier-naming.PrivateMemberSuffix, value: "_" },
   { key: readability-identifier-naming.ProtectedMemberSuffix, value: "" },
   { key: readability-identifier-naming.PublicMemberSuffix, value: "" },
-  { key: readability-identifier-naming.ParameterCase, value: "camelBack" },
   { key: readability-identifier-naming.ParameterIgnoredRegexp, value: "^[a-zA-Z]$" },
 ]
 WarningsAsErrors: ''
-HeaderFilterRegex: '.*'
+HeaderFilterRegex: '(orc/c\+\+/|orc/tools)'
 FormatStyle: none
\ No newline at end of file
diff --git a/.github/.licenserc.yaml b/.github/.licenserc.yaml
index a66db6601f..a16671e9d6 100644
--- a/.github/.licenserc.yaml
+++ b/.github/.licenserc.yaml
@@ -22,5 +22,6 @@ header:
     - 'NOTICE'
     - '.clang-format'
     - '.asf.yaml'
+    - '.nojekyll'
 
 comment: on-failure
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 8eddbcdea3..5722a9d3ba 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -20,9 +20,9 @@ updates:
     schedule:
       interval: "weekly"
     ignore:
-      # Pin gson to 2.2.4 because of Hive
+      # Pin gson to 2.9.0 because of Hive
       - dependency-name: "com.google.code.gson:gson"
-        versions: "[2.3,)"
+        versions: "[2.9.1,)"
       # Pin jodd-core to 3.5.2
       - dependency-name: "org.jodd:jodd-core"
         versions: "[3.5.3,)"
diff --git a/.github/lsan-suppressions.txt b/.github/lsan-suppressions.txt
new file mode 100644
index 0000000000..fc26ee8754
--- /dev/null
+++ b/.github/lsan-suppressions.txt
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Add specific leak suppressions here if needed
+# Format:
+# leak:SymbolName
+# leak:source_file.cc
diff --git a/.github/workflows/asan_test.yml b/.github/workflows/asan_test.yml
new file mode 100644
index 0000000000..f4a31525f3
--- /dev/null
+++ b/.github/workflows/asan_test.yml
@@ -0,0 +1,64 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Address Sanitizer Tests
+
+on:
+  pull_request:
+    paths-ignore:
+    - 'site/**'
+    - 'conan/**'
+    branches:
+    - main
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  asan-test:
+    name: "ASAN with ${{ matrix.compiler }} on Ubuntu"
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        compiler: [gcc, clang]
+        include:
+        - compiler: gcc
+          cc: gcc
+          cxx: g++
+        - compiler: clang
+          cc: clang
+          cxx: clang++
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+    - name: Configure and Build with ASAN
+      env:
+        CC: ${{ matrix.cc }}
+        CXX: ${{ matrix.cxx }}
+      run: |
+        mkdir build && cd build
+        cmake .. -DCMAKE_BUILD_TYPE=Debug -DENABLE_ASAN=ON -DBUILD_JAVA=OFF
+        make
+    - name: Run Tests
+      working-directory: build
+      env:
+        ASAN_OPTIONS: detect_leaks=1:symbolize=1:strict_string_checks=1:halt_on_error=0:detect_container_overflow=0
+        LSAN_OPTIONS: suppressions=${{ github.workspace }}/.github/lsan-suppressions.txt
+      run: |
+        ctest --output-on-failure
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index b0350193ba..2cebbe4c14 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 name: Build and test
 
 on:
@@ -32,6 +49,7 @@ jobs:
           - ubuntu24
           - fedora37
           - oraclelinux9
+          - amazonlinux23
     steps:
       - name: Checkout
        uses: actions/checkout@v2
@@ -47,11 +65,12 @@ jobs:
       fail-fast: false
       matrix:
         os:
-          - ubuntu-20.04
           - ubuntu-22.04
-          - macos-12
+          - ubuntu-24.04
+          - ubuntu-24.04-arm
           - macos-13
           - macos-14
+          - macos-15
         java:
           - 17
           - 21
@@ -67,28 +86,18 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v2
-      - name: Cache Maven local repository
-        uses: actions/cache@v2
-        with:
-          path: ~/.m2/repository
-          key: ${{ matrix.java }}-maven-${{ hashFiles('**/pom.xml') }}
-          restore-keys: |
-            ${{ matrix.java }}-maven-
       - name: Install Java ${{ matrix.java }}
-        uses: actions/setup-java@v3
+        uses: actions/setup-java@v4
         with:
           distribution: zulu
           java-version: ${{ matrix.java }}
+          cache: 'maven'
       - name: "Test"
         run: |
           mkdir -p ~/.m2
           mkdir build
           cd build
-          if [ "${{ matrix.os }}" = "ubuntu-20.04" ]; then
-            cmake -DANALYZE_JAVA=ON -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} -DSTOP_BUILD_ON_WARNING=OFF ..
-          else
-            cmake -DANALYZE_JAVA=ON -DOPENSSL_ROOT_DIR=`brew --prefix openssl@1.1` ..
-          fi
+          cmake -DANALYZE_JAVA=ON -DOPENSSL_ROOT_DIR=`brew --prefix openssl@1.1` ..
           make package test-out
       - name: Step on failure
         if: ${{ failure() }}
@@ -151,15 +160,16 @@ jobs:
   doc:
     name: "Javadoc generation"
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-24.04
     steps:
       - name: Checkout
         uses: actions/checkout@v2
       - name: Install Java 17
-        uses: actions/setup-java@v3
+        uses: actions/setup-java@v4
         with:
           distribution: zulu
           java-version: 17
+          cache: 'maven'
       - name: "javadoc"
         run: |
           mkdir -p ~/.m2
@@ -167,21 +177,33 @@ jobs:
           ./mvnw install -DskipTests
           ./mvnw javadoc:javadoc
 
-  formatting-check:
-    name: "C++ format check"
-    runs-on: ubuntu-20.04
-    strategy:
-      matrix:
-        path:
-          - 'c++'
-          - 'tools'
+  cpp-linter:
+    runs-on: ubuntu-24.04
     steps:
-      - uses: actions/checkout@v3
-      - name: Run clang-format style check for C++ code
-        uses: jidicula/clang-format-action@v4.9.0
-        with:
-          clang-format-version: '13'
-          check-path: ${{ matrix.path }}
+      - uses: actions/checkout@v4
+      - name: Run build
+        run: |
+          mkdir build && cd build
+          cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DBUILD_JAVA=OFF
+          cmake --build .
+      - uses: cpp-linter/cpp-linter-action@v2.13.3
+        id: linter
+        continue-on-error: true
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          style: file
+          tidy-checks: file
+          files-changed-only: true
+          lines-changed-only: true
+          thread-comments: true
+          ignore: 'build|cmake_modules|conan|dev|docker|examples|java|site'
+          database: build
+      - name: Fail fast?!
+        if: steps.linter.outputs.checks-failed != 0
+        run: |
+          echo "some linter checks failed. ${{ steps.linter.outputs.checks-failed }}"
+          exit 1
 
   license-check:
     name: "License Check"
@@ -196,3 +218,25 @@ jobs:
         with:
           config: .github/.licenserc.yaml
 
+  macos-cpp-check:
+    name: "C++ Test on macOS"
+    strategy:
+      fail-fast: false
+      matrix:
+        version: [13, 14]
+    runs-on: macos-${{ matrix.version }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Install dependencies
+        run: |
+          brew update
+          brew install protobuf
+      - name: Test
+        run: |
+          CMAKE_PREFIX_PATH=$(brew --prefix protobuf)
+          mkdir -p build
+          cd build
+          cmake .. -DBUILD_JAVA=OFF -DPROTOBUF_HOME=${CMAKE_PREFIX_PATH}
+          make package test-out
diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml
new file mode 100644
index 0000000000..52b2e1fc7b
--- /dev/null
+++ b/.github/workflows/pages.yml
@@ -0,0 +1,72 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: GitHub Pages deployment
+
+on:
+  push:
+    branches:
+      - main
+
+concurrency:
+  group: 'docs preview'
+  cancel-in-progress: false
+
+jobs:
+  docs:
+    name: Build and deploy documentation
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+      pages: write
+    environment:
+      name: github-pages # https://github.com/actions/deploy-pages/issues/271
+    if: github.repository == 'apache/orc'
+    steps:
+      - name: Checkout ORC repository
+        uses: actions/checkout@v4
+        with:
+          repository: apache/orc
+          ref: 'main'
+      - name: Install Java 17
+        uses: actions/setup-java@v4
+        with:
+          distribution: zulu
+          java-version: 17
+      - name: Install Ruby for documentation generation
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: '3.3'
+          bundler-cache: true
+      - name: Run documentation build
+        run: |
+          cd site
+          gem install bundler -n /usr/local/bin
+          bundle install --retry=100
+          git clone https://github.com/apache/orc.git -b asf-site target
+          bundle exec jekyll build -b /orc
+      - name: Setup Pages
+        uses: actions/configure-pages@v5
+      - name: Upload artifact
+        uses: actions/upload-pages-artifact@v3
+        with:
+          path: 'site/target'
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4
diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1fb0e755d6..e6ce4fde0a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,10 +27,11 @@ project(ORC C CXX)
 # Version number of package
 SET(CPACK_PACKAGE_VERSION_MAJOR "2")
-SET(CPACK_PACKAGE_VERSION_MINOR "1")
+SET(CPACK_PACKAGE_VERSION_MINOR "2")
 SET(CPACK_PACKAGE_VERSION_PATCH "0-SNAPSHOT")
 SET(ORC_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake_modules")
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # For clang-tidy.
+list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules")
 
 option (BUILD_JAVA
     "Include ORC Java library in the build process"
@@ -42,7 +43,7 @@ option (ANALYZE_JAVA
 
 option (BUILD_LIBHDFSPP
     "Include LIBHDFSPP library in the build process"
-    ON)
+    OFF)
 
 option(BUILD_CPP_TESTS
     "Build the googletest unit tests"
@@ -76,10 +77,18 @@ option(BUILD_ENABLE_AVX512
     "Enable build with AVX512 at compile time"
     OFF)
 
+option(ENABLE_ASAN
+    "Enable Address Sanitizer"
+    OFF)
+
 option(ORC_PACKAGE_KIND
     "Arbitrary string that identifies the kind of package"
     "")
 
+option(ORC_ENABLE_CLANG_TOOLS
+    "Enable Clang tools"
+    OFF)
+
 # Make sure that a build type is selected
 if (NOT CMAKE_BUILD_TYPE)
   message(STATUS "No build type selected, default to ReleaseWithDebugInfo")
@@ -151,17 +160,27 @@ elseif (MSVC)
   set (WARN_FLAGS "${WARN_FLAGS} -wd4521") # multiple copy constructors specified
   set (WARN_FLAGS "${WARN_FLAGS} -wd4146") # unary minus operator applied to unsigned type, result still unsigned
 endif ()
 
+# Configure Address Sanitizer if enabled
+if (ENABLE_ASAN)
+  if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+    message(STATUS "Address Sanitizer enabled")
+  else()
+    message(WARNING "Address Sanitizer is only supported for GCC and Clang compilers")
+  endif()
+endif()
 
-if (BUILD_CPP_ENABLE_METRICS)
-  message(STATUS "Enable the metrics collection")
-  add_compile_definitions(ENABLE_METRICS=1)
+enable_testing()
+
+INCLUDE(GNUInstallDirs) # Put it before ThirdpartyToolchain to make CMAKE_INSTALL_LIBDIR available.
+
+if (ORC_PACKAGE_KIND STREQUAL "vcpkg")
+  set(ORC_INSTALL_CMAKE_DIR ${CMAKE_INSTALL_DATAROOTDIR}/orc)
 else ()
-  message(STATUS "Disable the metrics collection")
-  add_compile_definitions(ENABLE_METRICS=0)
+  set(ORC_INSTALL_CMAKE_DIR ${CMAKE_INSTALL_LIBDIR}/cmake/orc)
 endif ()
 
-enable_testing()
-
 INCLUDE(CheckSourceCompiles)
 INCLUDE(ThirdpartyToolchain)
 
@@ -180,7 +199,7 @@ if (BUILD_ENABLE_AVX512 AND NOT APPLE)
   INCLUDE(ConfigSimdLevel)
 endif ()
 
-set (EXAMPLE_DIRECTORY ${CMAKE_SOURCE_DIR}/examples)
+set (EXAMPLE_DIRECTORY ${PROJECT_SOURCE_DIR}/examples)
 
 add_subdirectory(c++)
 
@@ -210,3 +229,7 @@ if (BUILD_CPP_TESTS)
     )
   endif ()
 endif ()
+
+if (ORC_ENABLE_CLANG_TOOLS)
+  INCLUDE(CheckFormat)
+endif ()
diff --git a/README.md b/README.md
index 60b0da5fcb..cf5c5d0793 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ The subdirectories are:
 ### Building
 
 * Install java 17 or higher
-* Install maven 3.9.6 or higher
+* Install maven 3.9.9 or higher
 * Install cmake 3.12 or higher
 
 To build a release version with debug information:
diff --git a/c++/CMakeLists.txt b/c++/CMakeLists.txt
index 449bd10f3e..38c38f7ce4 100644
--- a/c++/CMakeLists.txt
+++ b/c++/CMakeLists.txt
@@ -15,14 +15,23 @@
 # specific language governing permissions and limitations
 # under the License.
 
-include_directories (
-  ${CMAKE_CURRENT_BINARY_DIR}/include
-  "include"
-  )
-
 add_subdirectory(include)
 add_subdirectory(src)
 
 if (BUILD_CPP_TESTS)
   add_subdirectory(test)
 endif ()
+
+# Generate cmake package configuration files
+include(CMakePackageConfigHelpers)
+configure_package_config_file(
+  orcConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/orcConfig.cmake
+  INSTALL_DESTINATION ${ORC_INSTALL_CMAKE_DIR})
+write_basic_package_version_file(
+  ${CMAKE_CURRENT_BINARY_DIR}/orcConfigVersion.cmake
+  VERSION ${ORC_VERSION}
+  COMPATIBILITY SameMajorVersion)
+install(FILES
+  ${CMAKE_CURRENT_BINARY_DIR}/orcConfig.cmake
+  ${CMAKE_CURRENT_BINARY_DIR}/orcConfigVersion.cmake
+  DESTINATION ${ORC_INSTALL_CMAKE_DIR})
diff --git a/c++/build-support/README.md b/c++/build-support/README.md
new file mode 100644
index 0000000000..80966104bb
--- /dev/null
+++ b/c++/build-support/README.md
@@ -0,0 +1,30 @@
+# Build support
+
+The Python scripts under this folder provide capabilities for formatting code.
+Make sure you have installed `clang-format-13`, `clang-tidy-13` and
+`clang-apply-replacements-13`, and that CMake can find them.
+We enforce the version of the tools because different versions may generate
+different results.
+
+## clang-format
+
+To use `run_clang_format.py`, run the following:
+
+```shell
+mkdir build
+cd build
+cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DORC_ENABLE_CLANG_TOOLS=1
+make check-format # Do checks only
+make format # This would apply suggested changes, take care!
+```
+
+## clang-tidy
+
+To use `run_clang_tidy.py`, run the following:
+
+```shell
+mkdir build
+cd build
+cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DORC_ENABLE_CLANG_TOOLS=1
+make -j`nproc` # Important
+make check-clang-tidy # Do checks only
+make fix-clang-tidy # This would apply suggested changes, take care!
+```
diff --git a/c++/build-support/run_clang_format.py b/c++/build-support/run_clang_format.py
new file mode 100644
index 0000000000..52d2e6b255
--- /dev/null
+++ b/c++/build-support/run_clang_format.py
@@ -0,0 +1,132 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import codecs
+import difflib
+import fnmatch
+import os
+import subprocess
+import sys
+
+
+def check(arguments, source_dir):
+    formatted_filenames = []
+    error = False
+    for directory, subdirs, filenames in os.walk(source_dir):
+        fullpaths = (os.path.join(directory, filename)
+                     for filename in filenames)
+        source_files = [x for x in fullpaths
+                        if x.endswith(".hh") or x.endswith(".cc")]
+        formatted_filenames.extend(
+            # Filter out files that match the globs in the globs file
+            [filename for filename in source_files
+             if not any((fnmatch.fnmatch(filename, exclude_glob)
+                         for exclude_glob in exclude_globs))])
+
+    if arguments.fix:
+        if not arguments.quiet:
+            # Print out each file on its own line, but run
+            # clang format once for all of the files
+            print("\n".join(map(lambda x: "Formatting {}".format(x),
+                                formatted_filenames)))
+        subprocess.check_call([arguments.clang_format_binary,
+                               "-i"] + formatted_filenames)
+    else:
+        for filename in formatted_filenames:
+            if not arguments.quiet:
+                print("Checking {}".format(filename))
+            #
+            # Due to some incompatibilities between Python 2 and
+            # Python 3, there are some specific actions we take here
+            # to make sure the difflib.unified_diff call works.
+            #
+            # In Python 2, the call to subprocess.check_output returns
+            # a 'str' type. In Python 3, however, the call returns a
+            # 'bytes' type unless the 'encoding' argument is
+            # specified. Unfortunately, the 'encoding' argument is not
+            # in the Python 2 API. We could do an if/else here based
+            # on the version of Python we are running, but it's more
+            # straightforward to read the file in binary and do utf-8
+            # conversion. In Python 2, it's just converting string
+            # types to unicode types, whereas in Python 3 it's
+            # converting bytes types to utf-8 encoded str types. This
+            # approach ensures that the arguments to
+            # difflib.unified_diff are acceptable string types in both
+            # Python 2 and Python 3.
+            with open(filename, "rb") as reader:
+                # Run clang-format and capture its output
+                formatted = subprocess.check_output(
+                    [arguments.clang_format_binary,
+                     filename])
+                formatted = codecs.decode(formatted, "utf-8")
+                # Read the original file
+                original = codecs.decode(reader.read(), "utf-8")
+                # Run the equivalent of diff -u
+                diff = list(difflib.unified_diff(
+                    original.splitlines(True),
+                    formatted.splitlines(True),
+                    fromfile=filename,
+                    tofile="{} (after clang format)".format(
+                        filename)))
+                if diff:
+                    print("{} had clang-format style issues".format(filename))
+                    # Print out the diff to stderr
+                    error = True
+                    sys.stderr.writelines(diff)
+    return error
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Runs clang-format on all of the source "
+                    "files. If --fix is specified, reformats the "
+                    "files in place; otherwise compares the output "
+                    "with the existing files and outputs a unified "
+                    "diff if there are any necessary changes")
+    parser.add_argument("clang_format_binary",
+                        help="Path to the clang-format binary")
+    parser.add_argument("--exclude_globs",
+                        help="Filename containing globs for files "
+                        "that should be excluded from the checks")
+    parser.add_argument("--source_dirs",
+                        help="Comma-separated root directories of the code")
+    parser.add_argument("--fix", default=False,
+                        action="http://github.com/store_true",
+                        help="If specified, will re-format the source "
+                        "code instead of comparing the re-formatted "
+                        "output, defaults to %(default)s")
+    parser.add_argument("--quiet", default=False,
+                        action="http://github.com/store_true",
+                        help="If specified, only print errors")
+
+    args = parser.parse_args()
+
+    had_err = False
+    exclude_globs = []
+    if args.exclude_globs:
+        for line in open(args.exclude_globs):
+            if line.strip() == "":
+                continue
+            if line[0] == "#":
+                continue
+            exclude_globs.append(line.strip())
+
+    for source_dir in args.source_dirs.split(','):
+        if len(source_dir) > 0:
+            had_err = had_err or check(args, source_dir)
+
+    sys.exit(1 if had_err else 0)
\ No newline at end of file
diff --git a/run_clang_tidy.py b/c++/build-support/run_clang_tidy.py
old mode 100644
new mode 100755
similarity index 100%
rename from run_clang_tidy.py
rename to c++/build-support/run_clang_tidy.py
diff --git a/c++/include/CMakeLists.txt b/c++/include/CMakeLists.txt
index 056d1b9fab..a9f8b4a3b5 100644
--- a/c++/include/CMakeLists.txt
+++ b/c++/include/CMakeLists.txt
@@ -22,10 +22,11 @@ configure_file (
 
 install(FILES
   "${CMAKE_CURRENT_BINARY_DIR}/orc/orc-config.hh"
-  DESTINATION "include/orc"
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/orc"
   )
 
 install(DIRECTORY
-  "orc/"
-  DESTINATION "include/orc"
-  FILES_MATCHING PATTERN "*.hh")
+  "orc/"
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/orc"
+  FILES_MATCHING PATTERN "*.hh"
+  )
diff --git a/c++/include/orc/Exceptions.hh b/c++/include/orc/Exceptions.hh
index 97cf5d8a0d..b19a00760c 100644
--- a/c++/include/orc/Exceptions.hh
+++ b/c++/include/orc/Exceptions.hh
@@ -67,6 +67,18 @@ namespace orc {
     SchemaEvolutionError(const SchemaEvolutionError&);
     SchemaEvolutionError& operator=(const SchemaEvolutionError&) = delete;
   };
+
+  class CompressionError : public std::runtime_error {
+   public:
+    explicit CompressionError(const std::string& whatArg);
+    explicit CompressionError(const char* whatArg);
+    ~CompressionError() noexcept override;
+    CompressionError(const CompressionError&);
+
+   private:
+    CompressionError& operator=(const CompressionError&);
+  };
+
 }  // namespace orc
 
 #endif
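The new `CompressionError` gives callers a way to distinguish compression failures from generic runtime errors. A minimal sketch of catching it on the write path — the surrounding write logic is elided and only the exception handling is the point here:

```cpp
#include <iostream>

#include "orc/Exceptions.hh"

int run() {
  try {
    // ... write ORC data; compression happens as streams are flushed ...
  } catch (const orc::CompressionError& e) {
    // Thrown by the compression streams, e.g. when a block exceeds the
    // configured compression block size or the codec itself fails.
    std::cerr << "compression failed: " << e.what() << '\n';
    return 1;
  }
  return 0;
}
```

Since `CompressionError` derives from `std::runtime_error`, existing catch-all handlers keep working unchanged.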
diff --git a/c++/include/orc/OrcFile.hh b/c++/include/orc/OrcFile.hh
index 6e4a07bf7c..ea71567c5f 100644
--- a/c++/include/orc/OrcFile.hh
+++ b/c++/include/orc/OrcFile.hh
@@ -19,6 +19,7 @@
 #ifndef ORC_FILE_HH
 #define ORC_FILE_HH
 
+#include <future>
 
 #include "orc/Reader.hh"
@@ -58,6 +59,18 @@ namespace orc {
      */
     virtual void read(void* buf, uint64_t length, uint64_t offset) = 0;
 
+    /**
+     * Read data asynchronously into the buffer. The buffer is allocated by the caller.
+     * @param buf the buffer to read into
+     * @param length the number of bytes to read.
+     * @param offset the position in the stream to read from.
+     * @return a future that will be set when the read is complete.
+     */
+    virtual std::future<void> readAsync(void* buf, uint64_t length, uint64_t offset) {
+      return std::async(std::launch::async,
+                        [this, buf, length, offset] { this->read(buf, length, offset); });
+    }
+
     /**
      * Get the name of the stream for error messages.
      */
@@ -127,8 +140,8 @@ namespace orc {
    * @param path the uri of the file in HDFS
    * @param metrics the metrics of the reader
    */
-  std::unique_ptr<InputStream> readHdfsFile(const std::string& path,
-                                            ReaderMetrics* metrics = nullptr);
+  [[deprecated("readHdfsFile is deprecated in 2.0.1")]] std::unique_ptr<InputStream> readHdfsFile(
+      const std::string& path, ReaderMetrics* metrics = nullptr);
 
   /**
    * Create a reader to read the ORC file.
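The default `readAsync` above simply wraps the blocking `read` in `std::async`, so custom `InputStream` implementations get asynchrony for free and can override it with true async I/O later. A minimal caller-side sketch, assuming an existing `InputStream*` and placeholder offsets and lengths:

```cpp
#include <future>
#include <vector>

#include "orc/OrcFile.hh"

// Issue two reads concurrently and wait for both. The offsets and lengths
// are illustrative; real code would take them from stripe metadata.
void prefetchTwoRanges(orc::InputStream* stream) {
  std::vector<char> a(1024), b(1024);
  std::future<void> f1 = stream->readAsync(a.data(), a.size(), 0);
  std::future<void> f2 = stream->readAsync(b.data(), b.size(), 4096);
  f1.get();  // get() rethrows any I/O error raised inside the async task
  f2.get();
}
```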
diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index 4b254593ee..e9f420f113 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -40,6 +40,17 @@ namespace orc {
   struct ReaderOptionsPrivate;
   struct RowReaderOptionsPrivate;
 
+  struct CacheOptions {
+    // The maximum distance in bytes between two consecutive
+    // ranges; beyond this value, ranges are not combined
+    uint64_t holeSizeLimit = 8192;
+
+    // The maximum size in bytes of a combined range; if
+    // combining two consecutive ranges would produce a range of a
+    // size greater than this, they are not combined
+    uint64_t rangeSizeLimit = 32 * 1024 * 1024;
+  };
+
   /**
    * Expose the reader metrics including the latency and
    * number of calls of the decompression/decoding/IO modules.
@@ -59,9 +70,20 @@ namespace orc {
     std::atomic<uint64_t> IOBlockingLatencyUs{0};
     std::atomic<uint64_t> SelectedRowGroupCount{0};
     std::atomic<uint64_t> EvaluatedRowGroupCount{0};
+    std::atomic<uint64_t> ReadRangeCacheHits{0};
+    std::atomic<uint64_t> ReadRangeCacheMisses{0};
   };
   ReaderMetrics* getDefaultReaderMetrics();
 
+  // Row group index of a single column in a stripe.
+  struct RowGroupIndex {
+    // Positions are represented as a two-dimensional array where the first
+    // dimension is row group index and the second dimension is the position
+    // list of the row group. The size of the second dimension should be equal
+    // among all row groups.
+    std::vector<std::vector<uint64_t>> positions;
+  };
+
   /**
    * Options for creating a Reader.
    */
@@ -107,6 +129,11 @@ namespace orc {
      */
     ReaderOptions& setReaderMetrics(ReaderMetrics* metrics);
 
+    /**
+     * Set the cache options.
+     */
+    ReaderOptions& setCacheOptions(const CacheOptions& cacheOptions);
+
     /**
      * Set the location of the tail as defined by the logical length of the
     * file.
@@ -138,6 +165,11 @@ namespace orc {
      * Get the reader metrics.
      */
     ReaderMetrics* getReaderMetrics() const;
+
+    /**
+     * Get the cache options.
+     */
+    const CacheOptions& getCacheOptions() const;
   };
 
   /**
@@ -466,9 +498,11 @@ namespace orc {
     /**
      * Get the statistics about a stripe.
      * @param stripeIndex the index of the stripe (0 to N-1) to get statistics about
-     * @return the statistics about that stripe
+     * @param includeRowIndex whether the row index of the stripe is included
+     * @return the statistics about that stripe and row group index statistics
      */
-    virtual std::unique_ptr<StripeStatistics> getStripeStatistics(uint64_t stripeIndex) const = 0;
+    virtual std::unique_ptr<StripeStatistics> getStripeStatistics(
+        uint64_t stripeIndex, bool includeRowIndex = true) const = 0;
 
     /**
      * Get the length of the data stripes in the file.
@@ -605,6 +639,33 @@ namespace orc {
      */
     virtual std::map<uint32_t, BloomFilterIndex> getBloomFilters(
         uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0;
+
+    /**
+     * Get row group index of all selected columns in the specified stripe
+     * @param stripeIndex index of the stripe to be read for row group index.
+     * @param included index of selected columns to return (if not specified,
+     *        all columns will be returned).
+     * @return map of row group index keyed by its column index.
+     */
+    virtual std::map<uint32_t, RowGroupIndex> getRowGroupIndex(
+        uint32_t stripeIndex, const std::set<uint32_t>& included = {}) const = 0;
+
+    /**
+     * Trigger IO prefetch and cache the prefetched contents asynchronously.
+     * It is thread safe. Users should make sure requested stripes and columns
+     * are not overlapped, otherwise the overlapping part will be prefetched multiple times,
+     * which doesn't affect correctness but wastes IO and memory resources.
+     * @param stripes the stripes to prefetch
+     * @param includeTypes the types to prefetch
+     */
+    virtual void preBuffer(const std::vector<uint32_t>& stripes,
+                           const std::list<uint64_t>& includeTypes) = 0;
+
+    /**
+     * Release cached entries whose right boundary is less than or equal to the given boundary.
+     * @param boundary the boundary value to release cache entries
+     */
+    virtual void releaseBuffer(uint64_t boundary) = 0;
   };
 
   /**
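A sketch of how the new cache plumbing fits together on the read side: configure `CacheOptions`, prefetch a stripe, then release ranges once they have been consumed. The file path, stripe number, and column type ids are placeholders:

```cpp
#include <memory>
#include <string>

#include "orc/OrcFile.hh"
#include "orc/Reader.hh"

std::unique_ptr<orc::Reader> openWithCache(const std::string& path) {
  orc::ReaderOptions options;
  orc::CacheOptions cache;
  cache.holeSizeLimit = 8192;               // merge ranges separated by < 8 KB
  cache.rangeSizeLimit = 32 * 1024 * 1024;  // never merge past 32 MB
  options.setCacheOptions(cache);

  auto reader = orc::createReader(orc::readLocalFile(path), options);
  reader->preBuffer({0}, {1, 2});  // stripe 0, type ids 1 and 2 (placeholders)
  // ... read stripe 0 ...
  auto stripe = reader->getStripe(0);
  // Drop cached ranges that end at or before the end of the consumed stripe.
  reader->releaseBuffer(stripe->getOffset() + stripe->getLength());
  return reader;
}
```

The `holeSizeLimit`/`rangeSizeLimit` pair controls range coalescing: small holes are read through to save seeks, while the size cap keeps a single combined read from ballooning.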
diff --git a/c++/include/orc/Vector.hh b/c++/include/orc/Vector.hh
index 0dfe926965..663bef9cd7 100644
--- a/c++/include/orc/Vector.hh
+++ b/c++/include/orc/Vector.hh
@@ -57,6 +57,8 @@ namespace orc {
     bool hasNulls;
     // whether the vector batch is encoded
     bool isEncoded;
+    // whether the dictionary is decoded into vector batch
+    bool dictionaryDecoded;
 
     // custom memory pool
     MemoryPool& memoryPool;
@@ -88,6 +90,14 @@ namespace orc {
      */
     virtual bool hasVariableLength();
 
+    /**
+     * Decode possible dictionary into vector batch.
+     */
+    void decodeDictionary();
+
+   protected:
+    virtual void decodeDictionaryImpl() {}
+
    private:
     ColumnVectorBatch(const ColumnVectorBatch&);
     ColumnVectorBatch& operator=(const ColumnVectorBatch&);
@@ -248,6 +258,10 @@ namespace orc {
     ~EncodedStringVectorBatch() override;
     std::string toString() const override;
     void resize(uint64_t capacity) override;
+
+    // Calculate data and length in StringVectorBatch from dictionary and index
+    void decodeDictionaryImpl() override;
+
     std::shared_ptr<StringDictionary> dictionary;
 
     // index for dictionary entry
@@ -264,6 +278,9 @@ namespace orc {
     bool hasVariableLength() override;
 
     std::vector<ColumnVectorBatch*> fields;
+
+   protected:
+    void decodeDictionaryImpl() override;
   };
 
   struct ListVectorBatch : public ColumnVectorBatch {
@@ -283,6 +300,9 @@ namespace orc {
 
     // the concatenated elements
     std::unique_ptr<ColumnVectorBatch> elements;
+
+   protected:
+    void decodeDictionaryImpl() override;
   };
 
   struct MapVectorBatch : public ColumnVectorBatch {
@@ -304,6 +324,9 @@ namespace orc {
     std::unique_ptr<ColumnVectorBatch> keys;
     // the concatenated elements
     std::unique_ptr<ColumnVectorBatch> elements;
+
+   protected:
+    void decodeDictionaryImpl() override;
   };
 
   struct UnionVectorBatch : public ColumnVectorBatch {
@@ -327,6 +350,9 @@ namespace orc {
 
     // the sub-columns
     std::vector<ColumnVectorBatch*> children;
+
+   protected:
+    void decodeDictionaryImpl() override;
   };
 
   struct Decimal {
diff --git a/c++/include/orc/Writer.hh b/c++/include/orc/Writer.hh
index 7968fbce7f..78f06739bc 100644
--- a/c++/include/orc/Writer.hh
+++ b/c++/include/orc/Writer.hh
@@ -277,6 +277,32 @@ namespace orc {
      * @return if not set, return default value which is 1 MB.
      */
     uint64_t getOutputBufferCapacity() const;
+
+    /**
+     * Set the initial block size of the original input buffer in the class
+     * CompressionStream. The input buffer is used to store raw data before
+     * compression, while the output buffer is dedicated to holding compressed data.
+     */
+    WriterOptions& setMemoryBlockSize(uint64_t capacity);
+
+    /**
+     * Get the initial block size of the original input buffer in the class
+     * CompressionStream.
+     * @return if not set, return default value which is 64 KB.
+     */
+    uint64_t getMemoryBlockSize() const;
+
+    /**
+     * Set whether the compression block should be aligned to row group boundary.
+     * The boolean type may not be aligned to row group boundary due to the
+     * requirement of the Boolean RLE encoder to pack input bits into bytes.
+     */
+    WriterOptions& setAlignBlockBoundToRowGroup(bool alignBlockBoundToRowGroup);
+
+    /**
+     * Get whether the compression block should be aligned to row group boundary.
+     * @return if not set, return default value which is false.
+     */
+    bool getAlignBlockBoundToRowGroup() const;
   };
 
   class Writer {
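The two new writer knobs work together: `memoryBlockSize` controls how the raw-input staging buffer grows, while `alignBlockBoundToRowGroup` makes the writer cut a compression block at each row group boundary. A hedged sketch of wiring them up — the compression kind and sizes are illustrative, not recommended defaults:

```cpp
#include "orc/Writer.hh"

void configureCompression(orc::WriterOptions& options) {
  options.setCompression(orc::CompressionKind_ZSTD);
  options.setCompressionBlockSize(256 * 1024);  // max uncompressed bytes per chunk
  options.setMemoryBlockSize(64 * 1024);        // initial raw-input block size
  // Cut a compression block at every row group boundary so a row group never
  // starts in the middle of a compressed chunk (booleans may still spill a
  // partially filled byte, as the doc comment above notes).
  options.setAlignBlockBoundToRowGroup(true);
}
```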
diff --git a/c++/orcConfig.cmake.in b/c++/orcConfig.cmake.in
new file mode 100644
index 0000000000..49663b3423
--- /dev/null
+++ b/c++/orcConfig.cmake.in
@@ -0,0 +1,103 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# This config sets the following variables in your project:
+#
+#   orc_VERSION - version of the found ORC
+#   orc_FOUND - true if ORC found on the system
+#
+# This config sets the following targets in your project:
+#
+#   orc::orc - for linked as static library
+#
+# For backward compatibility, this config also sets the following variables:
+#
+#   ORC_FOUND - same as orc_FOUND above
+#   ORC_STATIC_LIB - static library of the found ORC
+#   ORC_INCLUDE_DIR - include directory of the found ORC
+#   ORC_INCLUDE_DIRS - same as ORC_INCLUDE_DIR above
+
+@PACKAGE_INIT@
+
+set(ORC_VENDOR_DEPENDENCIES "@ORC_VENDOR_DEPENDENCIES@")
+set(ORC_SYSTEM_DEPENDENCIES "@ORC_SYSTEM_DEPENDENCIES@")
+
+if(DEFINED CMAKE_MODULE_PATH)
+  set(ORC_CMAKE_MODULE_PATH_OLD ${CMAKE_MODULE_PATH})
+else()
+  unset(ORC_CMAKE_MODULE_PATH_OLD)
+endif()
+set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
+
+include(CMakeFindDependencyMacro)
+foreach(dependency ${ORC_SYSTEM_DEPENDENCIES})
+  find_dependency(${dependency})
+endforeach()
+
+if(DEFINED ORC_CMAKE_MODULE_PATH_OLD)
+  set(CMAKE_MODULE_PATH ${ORC_CMAKE_MODULE_PATH_OLD})
+  unset(ORC_CMAKE_MODULE_PATH_OLD)
+else()
+  unset(CMAKE_MODULE_PATH)
+endif()
+
+include("${CMAKE_CURRENT_LIST_DIR}/orcTargets.cmake")
+
+get_target_property(orc_static_configurations orc::orc IMPORTED_CONFIGURATIONS)
+
+foreach(dependency ${ORC_VENDOR_DEPENDENCIES})
+  string(REPLACE "|" ";" dependency_pair ${dependency})
+  list(LENGTH dependency_pair dependency_pair_length)
+  if(NOT dependency_pair_length EQUAL 2)
+    message(FATAL_ERROR "Invalid vendor dependency: ${dependency}")
+  endif()
+  list(GET dependency_pair 0 target_name)
+  list(GET dependency_pair 1 static_lib_name)
+
+  add_library("${target_name}" STATIC IMPORTED)
+
+  foreach(CONFIGURATION ${orc_static_configurations})
+    string(TOUPPER "${CONFIGURATION}" CONFIGURATION)
+    get_target_property(orc_static_location orc::orc LOCATION_${CONFIGURATION})
+    get_filename_component(orc_lib_dir "${orc_static_location}" DIRECTORY)
+    set_property(TARGET "${target_name}"
+                 APPEND
+                 PROPERTY IMPORTED_CONFIGURATIONS ${CONFIGURATION})
+    set_target_properties("${target_name}"
+                          PROPERTIES IMPORTED_LOCATION_${CONFIGURATION}
+                                     "${orc_lib_dir}/${static_lib_name}")
+  endforeach()
+endforeach()
+
+check_required_components(orc)
+
+foreach(BUILD_TYPE_SUFFIX
+        "_RELEASE"
+        "_RELWITHDEBINFO"
+        "_MINSIZEREL"
+        "_DEBUG"
+        "")
+  if(NOT ORC_STATIC_LIB)
+    get_target_property(ORC_STATIC_LIB orc::orc IMPORTED_LOCATION${BUILD_TYPE_SUFFIX})
+  endif()
+endforeach()
+
+get_target_property(ORC_INCLUDE_DIR orc::orc INTERFACE_INCLUDE_DIRECTORIES)
+
+set(ORC_FOUND TRUE)
+set(ORC_VERSION ${orc_VERSION})
+set(ORC_INCLUDE_DIRS ${ORC_INCLUDE_DIR})
diff --git a/c++/src/BlockBuffer.hh b/c++/src/BlockBuffer.hh
index 2faf38f7f9..6d265b0e32 100644
--- a/c++/src/BlockBuffer.hh
+++ b/c++/src/BlockBuffer.hh
@@ -106,12 +106,14 @@ namespace orc {
     }
 
     void resize(uint64_t size);
+
     /**
      * Requests the BlockBuffer to contain at least newCapacity bytes.
      * Reallocation happens if there is need of more space.
      * @param newCapacity new capacity of BlockBuffer
      */
     void reserve(uint64_t newCapacity);
+
     /**
      * Write the BlockBuffer content into OutputStream
      * @param output the output stream to write to
diff --git a/c++/src/ByteRLE.cc b/c++/src/ByteRLE.cc
index bdbaad1da6..ded9f55a00 100644
--- a/c++/src/ByteRLE.cc
+++ b/c++/src/ByteRLE.cc
@@ -63,6 +63,8 @@ namespace orc {
 
     virtual void suppress() override;
 
+    virtual void finishEncode() override;
+
     /**
      * Reset to initial state
      */
@@ -186,16 +188,17 @@ namespace orc {
 
   void ByteRleEncoderImpl::recordPosition(PositionRecorder* recorder) const {
     uint64_t flushedSize = outputStream->getSize();
-    uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition);
+    uint64_t unusedBufferSize = static_cast<uint64_t>(bufferLength - bufferPosition);
     if (outputStream->isCompressed()) {
       // start of the compression chunk in the stream
       recorder->add(flushedSize);
-      // number of decompressed bytes that need to be consumed
-      recorder->add(unflushedSize);
+      // There are multiple blocks in the input buffer, but bufferPosition only records the
+      // effective length of the last block. We need rawInputBufferSize to record the total length
+      // of all variable blocks.
+      recorder->add(outputStream->getRawInputBufferSize() - unusedBufferSize);
     } else {
-      flushedSize -= static_cast<uint64_t>(bufferLength);
       // byte offset of the RLE run’s start location
-      recorder->add(flushedSize + unflushedSize);
+      recorder->add(flushedSize - unusedBufferSize);
     }
     recorder->add(static_cast<uint64_t>(numLiterals));
   }
@@ -215,6 +218,13 @@ namespace orc {
     reset();
   }
 
+  void ByteRleEncoderImpl::finishEncode() {
+    writeValues();
+    outputStream->BackUp(bufferLength - bufferPosition);
+    outputStream->finishStream();
+    bufferLength = bufferPosition = 0;
+  }
+
   std::unique_ptr<ByteRleEncoder> createByteRleEncoder(
      std::unique_ptr<BufferedOutputStream> output) {
    return std::make_unique<ByteRleEncoderImpl>(std::move(output));
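The `recordPosition` change above is easy to misread, so here is a small worked sketch of the uncompressed branch. The numbers are illustrative, and it assumes `getSize()` already counts the block handed to the encoder but not yet fully filled:

```cpp
#include <cstdint>

// Uncompressed stream: recorded offset = getSize() minus the unused tail of
// the block currently checked out by the encoder.
uint64_t recordedStartOffset() {
  uint64_t flushedSize = 1000;    // outputStream->getSize(): bytes absorbed so far
  uint64_t bufferLength = 128;    // size of the block handed to the encoder
  uint64_t bufferPosition = 100;  // bytes of that block actually used
  uint64_t unusedBufferSize = bufferLength - bufferPosition;  // 28 bytes
  return flushedSize - unusedBufferSize;  // next RLE run starts at byte 972
}
```

In the compressed branch the same subtraction is applied to `getRawInputBufferSize()` instead, because the raw-input buffer now holds multiple variable-size blocks and `bufferPosition` only describes the last one.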
diff --git a/c++/src/ByteRLE.hh b/c++/src/ByteRLE.hh
index bd19f52ecc..bee064f666 100644
--- a/c++/src/ByteRLE.hh
+++ b/c++/src/ByteRLE.hh
@@ -59,6 +59,13 @@ namespace orc {
      * suppress the data and reset to initial state
      */
     virtual void suppress() = 0;
+
+    /**
+     * Finalize the encoding process. This function should be called after all data required for
+     * encoding has been added. It ensures that any remaining data is processed and the final state
+     * of the encoder is set.
+     */
+    virtual void finishEncode() = 0;
   };
 
   class ByteRleDecoder {
diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt
index 33ad584840..e378429f1e 100644
--- a/c++/src/CMakeLists.txt
+++ b/c++/src/CMakeLists.txt
@@ -138,12 +138,6 @@ configure_file (
   "${CMAKE_CURRENT_BINARY_DIR}/Adaptor.hh"
   )
 
-include_directories (
-  ${CMAKE_CURRENT_SOURCE_DIR}
-  ${CMAKE_CURRENT_BINARY_DIR}
-  ${LIBHDFSPP_INCLUDE_DIR}
-  )
-
 add_custom_command(OUTPUT orc_proto.pb.h orc_proto.pb.cc
    COMMAND ${PROTOBUF_EXECUTABLE}
         -I ../../orc-format_ep-prefix/src/orc-format_ep/src/main/proto/orc/proto
@@ -156,6 +150,7 @@ set(SOURCE_FILES
   orc_proto.pb.h
   io/InputStream.cc
   io/OutputStream.cc
+  io/Cache.cc
   sargs/ExpressionTree.cc
   sargs/Literal.cc
   sargs/PredicateLeaf.cc
@@ -197,7 +192,6 @@ set(SOURCE_FILES
 
 if(BUILD_LIBHDFSPP)
   set(SOURCE_FILES ${SOURCE_FILES} OrcHdfsFile.cc)
-  add_definitions(-DBUILD_LIBHDFSPP)
 endif(BUILD_LIBHDFSPP)
 
 if(BUILD_ENABLE_AVX512)
@@ -209,14 +203,45 @@ endif(BUILD_ENABLE_AVX512)
 add_library (orc STATIC ${SOURCE_FILES})
 
 target_link_libraries (orc
-  orc::protobuf
-  orc::zlib
-  orc::snappy
-  orc::lz4
-  orc::zstd
-  ${LIBHDFSPP_LIBRARIES}
+  INTERFACE
+    ${ORC_INSTALL_INTERFACE_TARGETS}
+  PRIVATE
+    $<BUILD_INTERFACE:orc::protobuf>
+    $<BUILD_INTERFACE:orc::zlib>
+    $<BUILD_INTERFACE:orc::snappy>
+    $<BUILD_INTERFACE:orc::lz4>
+    $<BUILD_INTERFACE:orc::zstd>
+    $<BUILD_INTERFACE:${LIBHDFSPP_LIBRARIES}>
   )
 
+target_include_directories (orc
+  INTERFACE
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+  PUBLIC
+    $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/c++/include>
+    $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/c++/include>
+  PRIVATE
+    ${CMAKE_CURRENT_BINARY_DIR}
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${LIBHDFSPP_INCLUDE_DIR}
+)
+
+if (BUILD_LIBHDFSPP)
+  target_compile_definitions(orc PUBLIC -DBUILD_LIBHDFSPP)
+endif (BUILD_LIBHDFSPP)
+
+if (BUILD_CPP_ENABLE_METRICS)
+  message(STATUS "Enable the metrics collection")
+  target_compile_definitions(orc PUBLIC ENABLE_METRICS=1)
+else ()
+  message(STATUS "Disable the metrics collection")
+  target_compile_definitions(orc PUBLIC ENABLE_METRICS=0)
+endif ()
+
 add_dependencies(orc orc-format_ep)
 
-install(TARGETS orc DESTINATION lib)
+install(TARGETS orc EXPORT orc_targets)
+install(EXPORT orc_targets
+  DESTINATION ${ORC_INSTALL_CMAKE_DIR}
+  NAMESPACE "orc::"
+  FILE "orcTargets.cmake")
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index e70f916ffd..af434c37ca 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -395,7 +395,7 @@ namespace orc {
       int64_t bits = 0;
       if (bufferEnd_ - bufferPointer_ >= 8) {
         if (isLittleEndian) {
-          bits = *(reinterpret_cast<const int64_t*>(bufferPointer_));
+          memcpy(&bits, bufferPointer_, sizeof(bits));
         } else {
           bits = static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[0]));
           bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[1])) << 8;
@@ -509,8 +509,10 @@ namespace orc {
       bufferNum = std::min(numValues,
                            static_cast<uint64_t>(bufferEnd_ - bufferPointer_) / bytesPerValue_);
       uint64_t bufferBytes = bufferNum * bytesPerValue_;
-      memcpy(outArray, bufferPointer_, bufferBytes);
-      bufferPointer_ += bufferBytes;
+      if (bufferBytes > 0) {
+        memcpy(outArray, bufferPointer_, bufferBytes);
+        bufferPointer_ += bufferBytes;
+      }
     }
     for (size_t i = bufferNum; i < numValues; ++i) {
       outArray[i] = readDouble();
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index 86e30ce90d..d31b1c65d4 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -24,6 +24,7 @@
 #include "RLE.hh"
 #include "Statistics.hh"
 #include "Timezone.hh"
+#include "Utils.hh"
 
 namespace orc {
   StreamsFactory::~StreamsFactory() {
@@ -47,11 +48,11 @@ namespace orc {
     // In the future, we can decide compression strategy and modifier
     // based on stream kind. But for now we just use the setting from
     // WriterOption
-    return createCompressor(options_.getCompression(), outStream_,
-                            options_.getCompressionStrategy(),
-                            // BufferedOutputStream initial capacity
-                            options_.getOutputBufferCapacity(), options_.getCompressionBlockSize(),
-                            *options_.getMemoryPool(), options_.getWriterMetrics());
+    return createCompressor(
+        options_.getCompression(), outStream_, options_.getCompressionStrategy(),
+        // BufferedOutputStream initial capacity
+        options_.getOutputBufferCapacity(), options_.getCompressionBlockSize(),
+        options_.getMemoryBlockSize(), *options_.getMemoryPool(), options_.getWriterMetrics());
   }
 
   std::unique_ptr<StreamsFactory> createStreamsFactory(const WriterOptions& options,
@@ -253,6 +254,10 @@ namespace orc {
     // PASS
   }
 
+  void ColumnWriter::finishStreams() {
+    notNullEncoder->finishEncode();
+  }
+
   class StructColumnWriter : public ColumnWriter {
    public:
     StructColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -282,6 +287,8 @@ namespace orc {
 
     virtual void reset() override;
 
+    virtual void finishStreams() override;
+
    private:
     std::vector<std::unique_ptr<ColumnWriter>> children_;
   };
@@ -415,6 +422,13 @@ namespace orc {
     }
   }
 
+  void StructColumnWriter::finishStreams() {
+    ColumnWriter::finishStreams();
+    for (uint32_t i = 0; i < children_.size(); ++i) {
+      children_[i]->finishStreams();
+    }
+  }
+
   template <typename T>
   class IntegerColumnWriter : public ColumnWriter {
    public:
@@ -432,6 +446,8 @@ namespace orc {
 
     virtual void recordPosition() const override;
 
+    virtual void finishStreams() override;
+
    protected:
     std::unique_ptr<RleEncoder> rleEncoder;
 
@@ -527,6 +543,12 @@ namespace orc {
     rleEncoder->recordPosition(rowIndexPosition.get());
   }
 
+  template <typename T>
+  void IntegerColumnWriter<T>::finishStreams() {
+    ColumnWriter::finishStreams();
+    rleEncoder->finishEncode();
+  }
+
   template <typename T>
   class ByteColumnWriter : public ColumnWriter {
    public:
@@ -543,6 +565,8 @@ namespace orc {
 
     virtual void recordPosition() const override;
 
+    virtual void finishStreams() override;
+
    private:
     std::unique_ptr<ByteRleEncoder> byteRleEncoder_;
   };
@@ -591,7 +615,7 @@ namespace orc {
       if (enableBloomFilter) {
         bloomFilter->addLong(data[i]);
       }
-      intStats->update(static_cast<int64_t>(byteData[i]), 1);
+      intStats->update(static_cast<int64_t>(static_cast<int8_t>(byteData[i])), 1);
     }
   }
   intStats->increase(count);
@@ -636,6 +660,12 @@ namespace orc {
     byteRleEncoder_->recordPosition(rowIndexPosition.get());
   }
 
+  template <typename T>
+  void ByteColumnWriter<T>::finishStreams() {
+    ColumnWriter::finishStreams();
+    byteRleEncoder_->finishEncode();
+  }
+
   template <typename T>
   class BooleanColumnWriter : public ColumnWriter {
    public:
@@ -653,6 +683,8 @@ namespace orc {
 
     virtual void recordPosition() const override;
 
+    virtual void finishStreams() override;
+
    private:
     std::unique_ptr<ByteRleEncoder> rleEncoder_;
   };
@@ -749,6 +781,12 @@ namespace orc {
     rleEncoder_->recordPosition(rowIndexPosition.get());
   }
 
+  template <typename T>
+  void BooleanColumnWriter<T>::finishStreams() {
+    ColumnWriter::finishStreams();
+    rleEncoder_->finishEncode();
+  }
+
   template <typename T>
   class FloatingColumnWriter : public ColumnWriter {
    public:
@@ -766,6 +804,8 @@ namespace orc {
 
     virtual void recordPosition() const override;
 
+    virtual void finishStreams() override;
+
    private:
     bool isFloat_;
     std::unique_ptr<AppendOnlyBufferedStream> dataStream_;
@@ -877,6 +917,12 @@ namespace orc {
     dataStream_->recordPosition(rowIndexPosition.get());
  }
 
+  template <typename T>
+  void FloatingColumnWriter<T>::finishStreams() {
+    ColumnWriter::finishStreams();
+    dataStream_->finishStream();
+  }
+
   /**
    * Implementation of increasing sorted string dictionary
    */
   class SortedStringDictionary {
    public:
     struct DictEntry {
       DictEntry(const char* str, size_t len) : data(str), length(len) {}
       const char* data;
       size_t length;
     };
 
+    struct DictEntryWithIndex {
+      DictEntryWithIndex(const char* str, size_t len, size_t index)
+          : entry(str, len), index(index) {}
+      DictEntry entry;
+      size_t index;
+    };
+
     SortedStringDictionary() : totalLength_(0) {}
 
     // insert a new string into dictionary, return its insertion order
-    size_t insert(const char* data, size_t len);
+    size_t insert(const char* str, size_t len);
 
     // write dictionary data & length to output buffer
     void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder) const;
@@ -912,7 +965,9 @@ namespace orc {
 
    private:
     struct LessThan {
-      bool operator()(const DictEntry& left, const DictEntry& right) const {
+      bool operator()(const DictEntryWithIndex& l, const DictEntryWithIndex& r) {
+        const auto& left = l.entry;
+        const auto& right = r.entry;
         int ret = memcmp(left.data, right.data, std::min(left.length, right.length));
         if (ret != 0) {
           return ret < 0;
@@ -921,8 +976,8 @@ namespace orc {
       }
     };
 
-    std::map<DictEntry, size_t, LessThan> dict_;
-    std::vector<std::vector<char>> data_;
+    mutable std::vector<DictEntryWithIndex> flatDict_;
+    std::unordered_map<std::string, size_t> keyToIndex_;
     uint64_t totalLength_;
 
     // use friend class here to avoid being bothered by const function calls
@@ -935,14 +990,10 @@ namespace orc {
 
   // insert a new string into dictionary, return its insertion order
   size_t SortedStringDictionary::insert(const char* str, size_t len) {
-    auto ret = dict_.insert({DictEntry(str, len), dict_.size()});
+    size_t index = flatDict_.size();
+    auto ret = keyToIndex_.emplace(std::string(str, len), index);
     if (ret.second) {
-      // make a copy to internal storage
-      data_.push_back(std::vector<char>(len));
-      memcpy(data_.back().data(), str, len);
-      // update dictionary entry to link pointer to internal storage
-      DictEntry* entry = const_cast<DictEntry*>(&(ret.first->first));
-      entry->data = data_.back().data();
+      flatDict_.emplace_back(ret.first->first.data(), ret.first->first.size(), index);
       totalLength_ += len;
     }
     return ret.first->second;
@@ -951,9 +1002,12 @@ namespace orc {
   // write dictionary data & length to output buffer
   void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
                                      RleEncoder* lengthEncoder) const {
-    for (auto it = dict_.cbegin(); it != dict_.cend(); ++it) {
-      dataStream->write(it->first.data, it->first.length);
-      lengthEncoder->write(static_cast<int64_t>(it->first.length));
+    std::sort(flatDict_.begin(), flatDict_.end(), LessThan());
+
+    for (const auto& entryWithIndex : flatDict_) {
+      const auto& entry = entryWithIndex.entry;
+      dataStream->write(entry.data, entry.length);
+      lengthEncoder->write(static_cast<int64_t>(entry.length));
     }
   }
 
@@ -969,10 +1023,9 @@ namespace orc {
    */
   void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const {
     // iterate the dictionary to get mapping from insertion order to value order
-    std::vector<size_t> mapping(dict_.size());
-    size_t dictIdx = 0;
-    for (auto it = dict_.cbegin(); it != dict_.cend(); ++it) {
-      mapping[it->second] = dictIdx++;
+    std::vector<size_t> mapping(flatDict_.size());
+    for (size_t i = 0; i < flatDict_.size(); ++i) {
+      mapping[flatDict_[i].index] = i;
     }
 
     // do the transformation
@@ -984,15 +1037,20 @@ namespace orc {
   // get dict entries in insertion order
   void SortedStringDictionary::getEntriesInInsertionOrder(
       std::vector<const DictEntry*>& entries) const {
-    entries.resize(dict_.size());
-    for (auto it = dict_.cbegin(); it != dict_.cend(); ++it) {
-      entries[it->second] = &(it->first);
+    std::sort(flatDict_.begin(), flatDict_.end(),
+              [](const DictEntryWithIndex& left, const DictEntryWithIndex& right) {
+                return left.index < right.index;
+              });
+
+    entries.resize(flatDict_.size());
+    for (size_t i = 0; i < flatDict_.size(); ++i) {
+      entries[i] = &(flatDict_[i].entry);
     }
   }
 
   // return count of entries
   size_t SortedStringDictionary::size() const {
-    return dict_.size();
+    return flatDict_.size();
   }
 
   // return total length of strings in the dictionary
@@ -1002,8 +1060,8 @@ namespace orc {
 
   void SortedStringDictionary::clear() {
     totalLength_ = 0;
-    data_.clear();
-    dict_.clear();
+    keyToIndex_.clear();
+    flatDict_.clear();
   }
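The flat-dictionary rewrite above trades the ordered `std::map` for a hash map plus a flat vector, deferring the lexicographic sort until the dictionary is actually written. A self-contained sketch of the same pattern with simplified types — this models the idea, it is not the ORC class itself:

```cpp
#include <algorithm>
#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

// Inserts are O(1) average via the hash map; the sorted order is
// established once, when the dictionary is flushed.
class FlatDictionary {
 public:
  size_t insert(const std::string& key) {
    auto ret = keyToIndex_.emplace(key, entries_.size());
    if (ret.second) {
      entries_.push_back({key, entries_.size()});  // remember insertion order
    }
    return ret.first->second;  // insertion order of the (possibly old) entry
  }

  // Sort lazily, only when the dictionary is written out.
  std::vector<std::string> sortedKeys() {
    std::sort(entries_.begin(), entries_.end(),
              [](const Entry& a, const Entry& b) { return a.key < b.key; });
    std::vector<std::string> keys;
    for (const auto& e : entries_) keys.push_back(e.key);
    return keys;
  }

 private:
  struct Entry {
    std::string key;
    size_t index;  // insertion order, used to remap row data before writing
  };
  std::unordered_map<std::string, size_t> keyToIndex_;
  std::vector<Entry> entries_;
};
```

The design point: one O(n log n) sort at flush time replaces an O(log n) tree rebalance (and per-entry string copy bookkeeping) on every insert, which is a better fit for the write-heavy insert path.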
 
   class StringColumnWriter : public ColumnWriter {
@@ -1028,6 +1086,8 @@ namespace orc {
 
     virtual void reset() override;
 
+    virtual void finishStreams() override;
+
    private:
     /**
      * dictionary related functions
@@ -1221,6 +1281,14 @@ namespace orc {
     }
   }
 
+  void StringColumnWriter::finishStreams() {
+    ColumnWriter::finishStreams();
+    if (!useDictionary) {
+      directDataStream->finishStream();
+      directLengthEncoder->finishEncode();
+    }
+  }
+
   bool StringColumnWriter::checkDictionaryKeyRatio() {
     if (!doneDictionaryCheck) {
       useDictionary = dictionary.size() <=
@@ -1356,75 +1424,6 @@ namespace orc {
     deleteDictStreams();
   }
 
-  struct Utf8Utils {
-    /**
-     * Counts how many utf-8 chars of the input data
-     */
-    static uint64_t charLength(const char* data, uint64_t length) {
-      uint64_t chars = 0;
-      for (uint64_t i = 0; i < length; i++) {
-        if (isUtfStartByte(data[i])) {
-          chars++;
-        }
-      }
-      return chars;
-    }
-
-    /**
-     * Return the number of bytes required to read at most maxCharLength
-     * characters in full from a utf-8 encoded byte array provided
-     * by data. This does not validate utf-8 data, but
-     * operates correctly on already valid utf-8 data.
-     *
-     * @param maxCharLength number of characters required
-     * @param data the bytes of UTF-8
-     * @param length the length of data to truncate
-     */
-    static uint64_t truncateBytesTo(uint64_t maxCharLength, const char* data, uint64_t length) {
-      uint64_t chars = 0;
-      if (length <= maxCharLength) {
-        return length;
-      }
-      for (uint64_t i = 0; i < length; i++) {
-        if (isUtfStartByte(data[i])) {
-          chars++;
-        }
-        if (chars > maxCharLength) {
-          return i;
-        }
-      }
-      // everything fits
-      return length;
-    }
-
-    /**
-     * Checks if b is the first byte of a UTF-8 character.
-     */
-    inline static bool isUtfStartByte(char b) {
-      return (b & 0xC0) != 0x80;
-    }
-
-    /**
-     * Find the start of the last character that ends in the current string.
-     * @param text the bytes of the utf-8
-     * @param from the first byte location
-     * @param until the last byte location
-     * @return the index of the last character
-     */
-    static uint64_t findLastCharacter(const char* text, uint64_t from, uint64_t until) {
-      uint64_t posn = until;
-      /* we don't expect characters more than 5 bytes */
-      while (posn >= from) {
-        if (isUtfStartByte(text[posn])) {
-          return posn;
-        }
-        posn -= 1;
-      }
-      /* beginning of a valid char not found */
-      throw std::logic_error("Could not truncate string, beginning of a valid char not found");
-    }
-  };
-
   class CharColumnWriter : public StringColumnWriter {
    public:
     CharColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options)
@@ -1639,6 +1638,8 @@ namespace orc {
 
     virtual void recordPosition() const override;
 
+    virtual void finishStreams() override;
+
    protected:
     std::unique_ptr<RleEncoder> secRleEncoder, nanoRleEncoder;
 
@@ -1779,6 +1780,12 @@ namespace orc {
     nanoRleEncoder->recordPosition(rowIndexPosition.get());
   }
 
+  void TimestampColumnWriter::finishStreams() {
+    ColumnWriter::finishStreams();
+    secRleEncoder->finishEncode();
+    nanoRleEncoder->finishEncode();
+  }
+
   class DateColumnWriter : public IntegerColumnWriter {
    public:
     DateColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options);
@@ -1848,6 +1855,8 @@ namespace orc {
 
     virtual void recordPosition() const override;
 
+    virtual void finishStreams() override;
+
    protected:
     RleVersion rleVersion;
     uint64_t precision;
@@ -1966,6 +1975,12 @@ namespace orc {
     scaleEncoder->recordPosition(rowIndexPosition.get());
   }
 
+  void Decimal64ColumnWriter::finishStreams() {
+    ColumnWriter::finishStreams();
+    valueStream->finishStream();
+    scaleEncoder->finishEncode();
+  }
+
   class Decimal64ColumnWriterV2 : public ColumnWriter {
    public:
     Decimal64ColumnWriterV2(const Type& type, const StreamsFactory& factory,
@@ -1982,6 +1997,8 @@ namespace orc {
 
     virtual void recordPosition() const override;
 
+    virtual void finishStreams() override;
+
    protected:
     uint64_t precision;
     uint64_t scale;
@@ -2072,6 +2089,11 @@ namespace orc {
     valueEncoder->recordPosition(rowIndexPosition.get());
   }
 
+  void Decimal64ColumnWriterV2::finishStreams() {
+    ColumnWriter::finishStreams();
+    valueEncoder->finishEncode();
+  }
+
   class Decimal128ColumnWriter : public Decimal64ColumnWriter {
    public:
     Decimal128ColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -2187,6 +2209,8 @@ namespace orc {
 
     virtual void reset() override;
 
+    virtual void finishStreams() override;
+
    private:
     std::unique_ptr<RleEncoder> lengthEncoder_;
     RleVersion rleVersion_;
@@ -2363,6 +2387,14 @@ namespace orc {
     }
   }
 
+  void ListColumnWriter::finishStreams() {
+    ColumnWriter::finishStreams();
+    lengthEncoder_->finishEncode();
+    if (child_) {
+      child_->finishStreams();
+    }
+  }
+
   class MapColumnWriter : public ColumnWriter {
    public:
     MapColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options);
@@ -2395,6 +2427,8 @@ namespace orc {
 
     virtual void reset() override;
 
+    virtual void finishStreams() override;
+
    private:
     std::unique_ptr<ColumnWriter> keyWriter_;
     std::unique_ptr<ColumnWriter> elemWriter_;
@@ -2613,6 +2647,17 @@ namespace orc {
     }
   }
 
+  void MapColumnWriter::finishStreams() {
+    ColumnWriter::finishStreams();
+    lengthEncoder_->finishEncode();
+    if (keyWriter_) {
+      keyWriter_->finishStreams();
+    }
+    if (elemWriter_) {
+      elemWriter_->finishStreams();
+    }
+  }
+
   class UnionColumnWriter : public ColumnWriter {
    public:
     UnionColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -2645,6 +2690,8 @@ namespace orc {
 
     virtual void reset() override;
 
+    virtual void finishStreams() override;
+
    private:
     std::unique_ptr<ByteRleEncoder> rleEncoder_;
     std::vector<std::unique_ptr<ColumnWriter>> children_;
@@ -2816,6 +2863,14 @@ namespace orc {
     }
   }
 
+  void UnionColumnWriter::finishStreams() {
+    ColumnWriter::finishStreams();
+    rleEncoder_->finishEncode();
+    for (uint32_t i = 0; i < children_.size(); ++i) {
+      children_[i]->finishStreams();
+    }
+  }
+
   std::unique_ptr<ColumnWriter> buildWriter(const Type& type, const StreamsFactory& factory,
                                             const WriterOptions& options) {
     switch (static_cast<int64_t>(type.getKind())) {
diff --git a/c++/src/ColumnWriter.hh b/c++/src/ColumnWriter.hh
index 8afd1eb72c..1c5e15d707 100644
--- a/c++/src/ColumnWriter.hh
+++ b/c++/src/ColumnWriter.hh
@@ -179,6 +179,18 @@ namespace orc {
      */
     virtual void writeDictionary();
 
+    /**
+     * Finalize the encoding and compressing process. This function should be
+     * called after all data required for encoding has been added. It ensures
+     * that any remaining data is processed and the final state of the streams
+     * is set.
+     * Note: the boolean type cannot cut off the current byte if it is not filled
+     * with 8 bits, otherwise Boolean RLE may incorrectly read the unfilled
+     * trailing bits. In this case, the last byte will be the head of the next
+     * compression block.
+     */
+    virtual void finishStreams();
+
    protected:
     /**
      * Utility function to translate ColumnStatistics into protobuf form and
     * add it to the
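Compression.cc below splits the old single buffer into a raw-input `BlockBuffer` (grown in `memoryBlockSize` steps) and a separate compressed-output buffer, compressing whenever the raw buffer reaches `compressionBlockSize`. A simplified sketch of that control flow — the names here are invented for illustration, not the real classes:

```cpp
#include <cstdint>
#include <vector>

// Toy model of the two-buffer scheme: callers append into `raw`; once it
// holds compressionBlockSize bytes, one compressed chunk is emitted.
struct ToyCompressionStream {
  uint64_t compressionBlockSize = 256 * 1024;  // max uncompressed chunk
  uint64_t memoryBlockSize = 64 * 1024;        // growth step of `raw`
  std::vector<uint8_t> raw;                    // uncompressed staging buffer

  void append(const uint8_t* data, uint64_t size) {
    for (uint64_t i = 0; i < size; ++i) {
      if (raw.size() == compressionBlockSize) {
        compressChunk();  // full block: write 3-byte header + codec output
      }
      if (raw.size() == raw.capacity()) {
        raw.reserve(raw.capacity() + memoryBlockSize);  // grow by one block
      }
      raw.push_back(data[i]);
    }
  }

  void finish() {  // mirrors finishStream(): flush the trailing partial block
    if (!raw.empty()) compressChunk();
  }

  void compressChunk() {
    // ... run the codec; if the output is not smaller than raw.size(), store
    // the block uncompressed with the "original" bit set in the header ...
    raw.clear();
  }
};
```

Decoupling the two sizes means a large compression block no longer forces an equally large upfront allocation; the raw buffer grows on demand in `memoryBlockSize` increments.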
outputSize(0), + compressionBlockSize(compressionBlockSize) { // init header pointer array header.fill(nullptr); } - void CompressionStreamBase::BackUp(int count) { - if (count > bufferSize) { - throw std::logic_error("Can't backup that much!"); - } - bufferSize -= count; - } - - uint64_t CompressionStreamBase::flush() { - void* data; - int size; - if (!Next(&data, &size)) { - throw std::runtime_error("Failed to flush compression buffer."); - } - BufferedOutputStream::BackUp(outputSize - outputPosition); - bufferSize = outputSize = outputPosition = 0; - return BufferedOutputStream::flush(); - } - - void CompressionStreamBase::suppress() { - outputBuffer = nullptr; - bufferSize = outputPosition = outputSize = 0; - BufferedOutputStream::suppress(); - } - uint64_t CompressionStreamBase::getSize() const { return BufferedOutputStream::getSize() - static_cast(outputSize - outputPosition); } @@ -149,12 +129,12 @@ namespace orc { while (offset < size) { if (outputPosition == outputSize) { if (!BufferedOutputStream::Next(reinterpret_cast(&outputBuffer), &outputSize)) { - throw std::runtime_error("Failed to get next output buffer from output stream."); + throw CompressionError("Failed to get next output buffer from output stream."); } outputPosition = 0; } else if (outputPosition > outputSize) { // for safety this will unlikely happen - throw std::logic_error("Write to an out-of-bound place during compression!"); + throw CompressionError("Write to an out-of-bound place during compression!"); } int currentSize = std::min(outputSize - outputPosition, size - offset); memcpy(outputBuffer + outputPosition, data + offset, static_cast(currentSize)); @@ -168,7 +148,7 @@ namespace orc { for (uint32_t i = 0; i < HEADER_SIZE; ++i) { if (outputPosition >= outputSize) { if (!BufferedOutputStream::Next(reinterpret_cast(&outputBuffer), &outputSize)) { - throw std::runtime_error("Failed to get next output buffer from output stream."); + throw CompressionError("Failed to get next output buffer from output stream."); } outputPosition = 0; } @@ -183,31 +163,74 @@ namespace orc { class CompressionStream : public CompressionStreamBase { public: CompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity, - uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics); + uint64_t compressionBlockSize, uint64_t memoryBlockSize, MemoryPool& pool, + WriterMetrics* metrics); virtual bool Next(void** data, int* size) override; virtual std::string getName() const override = 0; + virtual void BackUp(int count) override; + virtual void suppress() override; + virtual uint64_t flush() override; + uint64_t getRawInputBufferSize() const override { + return rawInputBuffer.size(); + } + virtual void finishStream() override { + compressInternal(); + BufferedOutputStream::finishStream(); + } protected: // return total compressed size virtual uint64_t doStreamingCompression() = 0; + + // Buffer to hold uncompressed data until user calls Next() + BlockBuffer rawInputBuffer; + + void compressInternal(); }; + void CompressionStream::BackUp(int count) { + uint64_t backup = static_cast(count); + uint64_t currSize = rawInputBuffer.size(); + if (backup > currSize) { + throw CompressionError("Can't backup that much!"); + } + rawInputBuffer.resize(currSize - backup); + } + + uint64_t CompressionStream::flush() { + compressInternal(); + BufferedOutputStream::BackUp(outputSize - outputPosition); + rawInputBuffer.resize(0); + outputSize = outputPosition = 0; + return BufferedOutputStream::flush(); + } + + void 
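
CompressionStream now buffers raw input in a BlockBuffer instead of a contiguous DataBuffer, so BackUp() simply shrinks the logical size. The contract being preserved is the ZeroCopyOutputStream one: Next() hands the caller a writable block, and BackUp(n) returns the unused n bytes. A toy model of that contract (not ORC's BufferedOutputStream):

#include <cstring>
#include <vector>

struct ToyStream {
  std::vector<char> buffer;
  size_t used = 0;

  bool Next(void** data, int* size) {
    const size_t block = 64;  // hand out fixed 64-byte blocks
    buffer.resize(used + block);
    *data = buffer.data() + used;
    *size = static_cast<int>(block);
    used += block;
    return true;
  }
  // Returning unused bytes is just a size adjustment.
  void BackUp(int count) { used -= static_cast<size_t>(count); }
};

int main() {
  ToyStream s;
  void* data;
  int size;
  s.Next(&data, &size);
  std::memcpy(data, "hello", 5);
  s.BackUp(size - 5);  // only 5 of the 64 bytes were written
  return s.used == 5 ? 0 : 1;
}
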
CompressionStream::suppress() { + outputBuffer = nullptr; + outputPosition = outputSize = 0; + rawInputBuffer.resize(0); + BufferedOutputStream::suppress(); + } + CompressionStream::CompressionStream(OutputStream* outStream, int compressionLevel, - uint64_t capacity, uint64_t blockSize, MemoryPool& pool, + uint64_t capacity, uint64_t compressionBlockSize, + uint64_t memoryBlockSize, MemoryPool& pool, WriterMetrics* metrics) - : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, pool, metrics) { + : CompressionStreamBase(outStream, compressionLevel, capacity, compressionBlockSize, + memoryBlockSize, pool, metrics), + rawInputBuffer(pool, memoryBlockSize) { // PASS } - bool CompressionStream::Next(void** data, int* size) { - if (bufferSize != 0) { + void CompressionStream::compressInternal() { + if (rawInputBuffer.size() != 0) { ensureHeader(); uint64_t preSize = getSize(); uint64_t totalCompressedSize = doStreamingCompression(); - if (totalCompressedSize >= static_cast(bufferSize)) { - writeHeader(static_cast(bufferSize), true); + if (totalCompressedSize >= static_cast(rawInputBuffer.size())) { + writeHeader(static_cast(rawInputBuffer.size()), true); // reset output buffer outputBuffer = nullptr; outputPosition = outputSize = 0; @@ -215,23 +238,42 @@ namespace orc { BufferedOutputStream::BackUp(static_cast(backup)); // copy raw input buffer into block buffer - writeData(rawInputBuffer.data(), bufferSize); + uint64_t blockNumber = rawInputBuffer.getBlockNumber(); + for (uint64_t i = 0; i < blockNumber; ++i) { + auto block = rawInputBuffer.getBlock(i); + writeData(reinterpret_cast(block.data), block.size); + } } else { writeHeader(totalCompressedSize, false); } + rawInputBuffer.resize(0); } + } - *data = rawInputBuffer.data(); - *size = static_cast(rawInputBuffer.size()); - bufferSize = *size; + bool CompressionStream::Next(void** data, int* size) { + if (rawInputBuffer.size() > compressionBlockSize) { + std::stringstream ss; + ss << "uncompressed data size " << rawInputBuffer.size() + << " is larger than compression block size " << compressionBlockSize; + throw CompressionError(ss.str()); + } + + // compress data in the rawInputBuffer when it is full + if (rawInputBuffer.size() == compressionBlockSize) { + compressInternal(); + } + auto block = rawInputBuffer.getNextBlock(); + *data = block.data; + *size = static_cast(block.size); return true; } class ZlibCompressionStream : public CompressionStream { public: - ZlibCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity, - uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics); + ZlibCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t bufferCapacity, + uint64_t compressionBlockSize, uint64_t memoryBlockSize, MemoryPool& pool, + WriterMetrics* metrics); virtual ~ZlibCompressionStream() override { end(); @@ -249,42 +291,57 @@ namespace orc { }; ZlibCompressionStream::ZlibCompressionStream(OutputStream* outStream, int compressionLevel, - uint64_t capacity, uint64_t blockSize, - MemoryPool& pool, WriterMetrics* metrics) - : CompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) { + uint64_t bufferCapacity, + uint64_t compressionBlockSize, + uint64_t memoryBlockSize, MemoryPool& pool, + WriterMetrics* metrics) + : CompressionStream(outStream, compressionLevel, bufferCapacity, compressionBlockSize, + memoryBlockSize, pool, metrics) { init(); } uint64_t ZlibCompressionStream::doStreamingCompression() { if (deflateReset(&strm_) != Z_OK) { - throw 
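
The new Next() decouples two sizes that used to be a single knob: memoryBlockSize sets the granularity of the blocks handed to callers, while compressionBlockSize is the threshold at which accumulated raw input is actually compressed. A toy model of that accumulate-then-compress policy, with compressOneChunk() standing in for doStreamingCompression():

#include <cstdint>
#include <string>
#include <vector>

class AccumulatingSink {
 public:
  explicit AccumulatingSink(uint64_t compressionBlockSize) : limit_(compressionBlockSize) {}

  void append(const std::string& bytes) {
    raw_.insert(raw_.end(), bytes.begin(), bytes.end());
    if (raw_.size() >= limit_) {
      compressOneChunk();  // placeholder for real codec work
      raw_.clear();
    }
  }
  uint64_t chunksCompressed() const { return chunks_; }

 private:
  void compressOneChunk() { ++chunks_; }
  std::vector<char> raw_;
  uint64_t limit_;
  uint64_t chunks_ = 0;
};

int main() {
  AccumulatingSink sink(8);
  for (int i = 0; i < 5; ++i) sink.append("abcd");  // 20 bytes, threshold 8
  return sink.chunksCompressed() == 2 ? 0 : 1;      // compressed at 8 and 16
}
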
std::runtime_error("Failed to reset inflate."); + throw CompressionError("Failed to reset inflate."); } - strm_.avail_in = static_cast(bufferSize); - strm_.next_in = rawInputBuffer.data(); + // iterate through all blocks + uint64_t blockId = 0; + bool finish = false; do { - if (outputPosition >= outputSize) { - if (!BufferedOutputStream::Next(reinterpret_cast(&outputBuffer), &outputSize)) { - throw std::runtime_error("Failed to get next output buffer from output stream."); - } - outputPosition = 0; + if (blockId == rawInputBuffer.getBlockNumber()) { + finish = true; + strm_.avail_in = 0; + strm_.next_in = nullptr; + } else { + auto block = rawInputBuffer.getBlock(blockId++); + strm_.avail_in = static_cast(block.size); + strm_.next_in = reinterpret_cast(block.data); } - strm_.next_out = reinterpret_cast(outputBuffer + outputPosition); - strm_.avail_out = static_cast(outputSize - outputPosition); - int ret = deflate(&strm_, Z_FINISH); - outputPosition = outputSize - static_cast(strm_.avail_out); + do { + if (outputPosition >= outputSize) { + if (!BufferedOutputStream::Next(reinterpret_cast(&outputBuffer), &outputSize)) { + throw CompressionError("Failed to get next output buffer from output stream."); + } + outputPosition = 0; + } + strm_.next_out = reinterpret_cast(outputBuffer + outputPosition); + strm_.avail_out = static_cast(outputSize - outputPosition); - if (ret == Z_STREAM_END) { - break; - } else if (ret == Z_OK) { - // needs more buffer so will continue the loop - } else { - throw std::runtime_error("Failed to deflate input data."); - } - } while (strm_.avail_out == 0); + int ret = deflate(&strm_, finish ? Z_FINISH : Z_NO_FLUSH); + outputPosition = outputSize - static_cast(strm_.avail_out); + if (ret == Z_STREAM_END) { + break; + } else if (ret == Z_OK) { + // needs more buffer so will continue the loop + } else { + throw CompressionError("Failed to deflate input data."); + } + } while (strm_.avail_out == 0); + } while (!finish); return strm_.total_out; } @@ -305,7 +362,7 @@ namespace orc { strm_.next_in = nullptr; if (deflateInit2(&strm_, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) { - throw std::runtime_error("Error while calling deflateInit2() for zlib."); + throw CompressionError("Error while calling deflateInit2() for zlib."); } } @@ -505,7 +562,7 @@ namespace orc { } else if (state == DECOMPRESS_START) { NextDecompress(data, size, availableSize); } else { - throw std::logic_error( + throw CompressionError( "Unknown compression state in " "DecompressionStream::Next"); } @@ -519,7 +576,7 @@ namespace orc { void DecompressionStream::BackUp(int count) { if (outputBuffer == nullptr || outputBufferLength != 0) { - throw std::logic_error("Backup without previous Next in " + getName()); + throw CompressionError("Backup without previous Next in " + getName()); } outputBuffer -= static_cast(count); outputBufferLength = static_cast(count); @@ -647,13 +704,17 @@ namespace orc { case Z_OK: break; case Z_MEM_ERROR: - throw std::logic_error("Memory error from inflateInit2"); + throw CompressionError( + "Memory error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2"); case Z_VERSION_ERROR: - throw std::logic_error("Version error from inflateInit2"); + throw CompressionError( + "Version error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2"); case Z_STREAM_ERROR: - throw std::logic_error("Stream error from inflateInit2"); + throw CompressionError( + "Stream error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2"); default: - 
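
The reworked deflate loop above feeds each BlockBuffer block with Z_NO_FLUSH and only passes Z_FINISH once every block has been consumed, so multiple discontiguous buffers compress into a single deflate stream. A standalone illustration of that zlib pattern with two input buffers:

#include <zlib.h>
#include <cstring>
#include <vector>

int main() {
  const char* blocks[2] = {"hello ", "world"};
  z_stream strm;
  std::memset(&strm, 0, sizeof(strm));
  // Raw deflate (windowBits -15), matching the writer's init() above.
  if (deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -15, 8,
                   Z_DEFAULT_STRATEGY) != Z_OK) {
    return 1;
  }

  std::vector<unsigned char> out(256);  // ample for this tiny input
  strm.next_out = out.data();
  strm.avail_out = static_cast<uInt>(out.size());

  for (int i = 0; i < 2; ++i) {
    strm.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(blocks[i]));
    strm.avail_in = static_cast<uInt>(std::strlen(blocks[i]));
    const bool last = (i == 1);
    // Intermediate blocks: Z_NO_FLUSH. Final block: Z_FINISH.
    int ret = deflate(&strm, last ? Z_FINISH : Z_NO_FLUSH);
    if (ret != (last ? Z_STREAM_END : Z_OK)) return 1;
  }
  bool produced = strm.total_out > 0;
  deflateEnd(&strm);
  return produced ? 0 : 1;
}
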
throw std::logic_error("Unknown error from inflateInit2"); + throw CompressionError( + "Unknown error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2"); } } @@ -674,7 +735,7 @@ namespace orc { zstream_.next_out = reinterpret_cast(const_cast(outputBuffer)); zstream_.avail_out = static_cast(outputDataBuffer.capacity()); if (inflateReset(&zstream_) != Z_OK) { - throw std::logic_error( + throw CompressionError( "Bad inflateReset in " "ZlibDecompressionStream::NextDecompress"); } @@ -694,19 +755,19 @@ namespace orc { case Z_STREAM_END: break; case Z_BUF_ERROR: - throw std::logic_error( + throw CompressionError( "Buffer error in " "ZlibDecompressionStream::NextDecompress"); case Z_DATA_ERROR: - throw std::logic_error( + throw CompressionError( "Data error in " "ZlibDecompressionStream::NextDecompress"); case Z_STREAM_ERROR: - throw std::logic_error( + throw CompressionError( "Stream error in " "ZlibDecompressionStream::NextDecompress"); default: - throw std::logic_error( + throw CompressionError( "Unknown error in " "ZlibDecompressionStream::NextDecompress"); } @@ -812,7 +873,7 @@ namespace orc { } if (outLength > maxOutputLength) { - throw std::logic_error("Snappy length exceeds block size"); + throw CompressionError("Snappy length exceeds block size"); } if (!snappy::RawUncompress(input, length, output)) { @@ -881,14 +942,23 @@ namespace orc { public: BlockCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity, uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics) - : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, pool, metrics), - compressorBuffer(pool) { + : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, blockSize, pool, + metrics), + compressorBuffer(pool), + rawInputBuffer(pool, blockSize) { // PASS } virtual bool Next(void** data, int* size) override; virtual void suppress() override; + virtual void BackUp(int count) override; + virtual uint64_t flush() override; virtual std::string getName() const override = 0; + uint64_t getRawInputBufferSize() const override { + return bufferSize; + } + + virtual void finishStream() override; protected: // compresses a block and returns the compressed size @@ -900,8 +970,23 @@ namespace orc { // should allocate max possible compressed size DataBuffer compressorBuffer; + + // Buffer to hold uncompressed data until user calls Next() + DataBuffer rawInputBuffer; }; + void BlockCompressionStream::BackUp(int count) { + if (count > bufferSize) { + throw CompressionError("Can't backup that much!"); + } + bufferSize -= count; + } + + uint64_t BlockCompressionStream::flush() { + finishStream(); + return BufferedOutputStream::flush(); + } + bool BlockCompressionStream::Next(void** data, int* size) { if (bufferSize != 0) { ensureHeader(); @@ -935,7 +1020,19 @@ namespace orc { void BlockCompressionStream::suppress() { compressorBuffer.resize(0); - CompressionStreamBase::suppress(); + outputBuffer = nullptr; + bufferSize = outputPosition = outputSize = 0; + BufferedOutputStream::suppress(); + } + + void BlockCompressionStream::finishStream() { + void* data; + int size; + if (!Next(&data, &size)) { + throw CompressionError("Failed to flush compression buffer."); + } + BufferedOutputStream::BackUp(outputSize - outputPosition); + bufferSize = outputSize = outputPosition = 0; } /** @@ -976,7 +1073,7 @@ namespace orc { reinterpret_cast(compressorBuffer.data()), bufferSize, static_cast(compressorBuffer.size()), level); if (result == 0) { - throw 
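
The LZ4 path relies on the C API's error convention: a zero return value means failure, which the writer now surfaces as CompressionError rather than a generic runtime_error. A self-contained sketch of that convention using the public one-shot API:

#include <lz4.h>
#include <cstring>
#include <vector>

int main() {
  const char* input = "repetitive repetitive repetitive repetitive";
  const int inSize = static_cast<int>(std::strlen(input));
  // LZ4_compressBound() gives the worst-case compressed size.
  std::vector<char> out(LZ4_compressBound(inSize));
  int written = LZ4_compress_default(input, out.data(), inSize,
                                     static_cast<int>(out.size()));
  if (written == 0) {
    return 1;  // the point where the writer throws CompressionError
  }
  return 0;
}
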
std::runtime_error("Error during block compression using lz4."); + throw CompressionError("Error during block compression using lz4."); } return static_cast(result); } @@ -984,7 +1081,7 @@ namespace orc { void Lz4CompressionSteam::init() { state_ = LZ4_createStream(); if (!state_) { - throw std::runtime_error("Error while allocating state for lz4."); + throw CompressionError("Error while allocating state for lz4."); } } @@ -1072,7 +1169,7 @@ namespace orc { void ZSTDCompressionStream::init() { cctx_ = ZSTD_createCCtx(); if (!cctx_) { - throw std::runtime_error("Error while calling ZSTD_createCCtx() for zstd."); + throw CompressionError("Error while calling ZSTD_createCCtx() for zstd."); } } @@ -1129,7 +1226,7 @@ namespace orc { void ZSTDDecompressionStream::init() { dctx_ = ZSTD_createDCtx(); if (!dctx_) { - throw std::runtime_error("Error while calling ZSTD_createDCtx() for zstd."); + throw CompressionError("Error while calling ZSTD_createDCtx() for zstd."); } } @@ -1140,12 +1237,10 @@ namespace orc { DIAGNOSTIC_PUSH - std::unique_ptr createCompressor(CompressionKind kind, - OutputStream* outStream, - CompressionStrategy strategy, - uint64_t bufferCapacity, - uint64_t compressionBlockSize, - MemoryPool& pool, WriterMetrics* metrics) { + std::unique_ptr createCompressor( + CompressionKind kind, OutputStream* outStream, CompressionStrategy strategy, + uint64_t bufferCapacity, uint64_t compressionBlockSize, uint64_t memoryBlockSize, + MemoryPool& pool, WriterMetrics* metrics) { switch (static_cast(kind)) { case CompressionKind_NONE: { return std::make_unique(pool, outStream, bufferCapacity, @@ -1154,8 +1249,8 @@ namespace orc { case CompressionKind_ZLIB: { int level = (strategy == CompressionStrategy_SPEED) ? Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION; - return std::make_unique(outStream, level, bufferCapacity, - compressionBlockSize, pool, metrics); + return std::make_unique( + outStream, level, bufferCapacity, compressionBlockSize, memoryBlockSize, pool, metrics); } case CompressionKind_ZSTD: { int level = (strategy == CompressionStrategy_SPEED) ? 
1 : ZSTD_CLEVEL_DEFAULT; diff --git a/c++/src/Compression.hh b/c++/src/Compression.hh index 55b152dd63..24170c56b4 100644 --- a/c++/src/Compression.hh +++ b/c++/src/Compression.hh @@ -42,15 +42,16 @@ namespace orc { * @param outStream the output stream that is the underlying target * @param strategy compression strategy * @param bufferCapacity compression stream buffer total capacity - * @param compressionBlockSize compression buffer block size + * @param compressionBlockSize compression is triggered when the original input buffer size + * reaches this size + * @param memoryBlockSize the block size for original input buffer * @param pool the memory pool + * @param metrics the writer metrics */ - std::unique_ptr createCompressor(CompressionKind kind, - OutputStream* outStream, - CompressionStrategy strategy, - uint64_t bufferCapacity, - uint64_t compressionBlockSize, - MemoryPool& pool, WriterMetrics* metrics); + std::unique_ptr createCompressor( + CompressionKind kind, OutputStream* outStream, CompressionStrategy strategy, + uint64_t bufferCapacity, uint64_t compressionBlockSize, uint64_t memoryBlockSize, + MemoryPool& pool, WriterMetrics* metrics); } // namespace orc #endif diff --git a/c++/src/ConvertColumnReader.cc b/c++/src/ConvertColumnReader.cc index 67ee6d6c45..c0f88246e8 100644 --- a/c++/src/ConvertColumnReader.cc +++ b/c++/src/ConvertColumnReader.cc @@ -17,6 +17,9 @@ */ #include "ConvertColumnReader.hh" +#include "Utils.hh" + +#include namespace orc { @@ -72,6 +75,23 @@ namespace orc { } } + static inline void handleParseFromStringError(ColumnVectorBatch& dstBatch, uint64_t idx, + bool shouldThrow, const std::string& typeName, + const std::string& str, + const std::string& expectedFormat = "") { + if (!shouldThrow) { + dstBatch.notNull.data()[idx] = 0; + dstBatch.hasNulls = true; + } else { + std::ostringstream ss; + ss << "Failed to parse " << typeName << " from string:" << str; + if (expectedFormat != "") { + ss << " the following format \"" << expectedFormat << "\" is expected"; + } + throw SchemaEvolutionError(ss.str()); + } + } + // return false if overflow template static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) { @@ -399,13 +419,14 @@ namespace orc { ConvertToTimestampColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe, bool throwOnOverflow) : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow), - readerTimezone(readType.getKind() == TIMESTAMP_INSTANT ? &getTimezoneByName("GMT") - : &stripe.getReaderTimezone()), + isInstant(readType.getKind() == TIMESTAMP_INSTANT), + readerTimezone(isInstant ? 
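
handleParseFromStringError() centralizes the two failure policies for these string conversions: tolerate the bad cell (mark it null) or fail the read (throw). A self-contained sketch of the same split, with parseLong() standing in for a column conversion and std::runtime_error standing in for SchemaEvolutionError:

#include <optional>
#include <stdexcept>
#include <string>

std::optional<long> parseLong(const std::string& s, bool shouldThrow) {
  try {
    return std::stol(s);
  } catch (...) {
    if (shouldThrow) {
      throw std::runtime_error("Failed to parse Long from string:" + s);
    }
    return std::nullopt;  // caller marks notNull[idx] = 0, hasNulls = true
  }
}

int main() {
  if (parseLong("not-a-number", /*shouldThrow=*/false).has_value()) return 1;
  try {
    parseLong("not-a-number", /*shouldThrow=*/true);
    return 1;  // should not get here
  } catch (const std::runtime_error&) {
    return 0;  // expected
  }
}
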
&getTimezoneByName("GMT") : &stripe.getReaderTimezone()), needConvertTimezone(readerTimezone != &getTimezoneByName("GMT")) {} void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; protected: + const bool isInstant; const orc::Timezone* readerTimezone; const bool needConvertTimezone; }; @@ -558,6 +579,8 @@ namespace orc { const auto& srcBatch = *SafeCastBatchTo(data.get()); auto& dstBatch = *SafeCastBatchTo(&rowBatch); + dstBatch.precision = toPrecision_; + dstBatch.scale = toScale_; for (uint64_t i = 0; i < numValues; ++i) { if (!rowBatch.hasNulls || rowBatch.notNull[i]) { convertDecimalToDecimal(dstBatch, i, srcBatch); @@ -694,6 +717,318 @@ namespace orc { const int32_t scale_; }; + template + class StringVariantToNumericColumnReader : public ConvertColumnReader { + public: + StringVariantToNumericColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertColumnReader::next(rowBatch, numValues, notNull); + + const auto& srcBatch = *SafeCastBatchTo(data.get()); + auto& dstBatch = *SafeCastBatchTo(&rowBatch); + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + if constexpr (std::is_floating_point_v) { + convertToDouble(dstBatch, srcBatch, i); + } else { + convertToInteger(dstBatch, srcBatch, i); + } + } + } + } + + private: + void convertToInteger(ReadTypeBatch& dstBatch, const StringVectorBatch& srcBatch, + uint64_t idx) { + int64_t longValue = 0; + const std::string longStr(srcBatch.data[idx], srcBatch.length[idx]); + try { + longValue = std::stoll(longStr); + } catch (...) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Long", longStr); + return; + } + if constexpr (std::is_same_v) { + dstBatch.data[idx] = longValue == 0 ? 0 : 1; + } else { + if (!downCastToInteger(dstBatch.data[idx], longValue)) { + handleOverflow(dstBatch, idx, throwOnOverflow); + } + } + } + + void convertToDouble(ReadTypeBatch& dstBatch, const StringVectorBatch& srcBatch, uint64_t idx) { + const std::string floatValue(srcBatch.data[idx], srcBatch.length[idx]); + try { + if constexpr (std::is_same_v) { + dstBatch.data[idx] = std::stof(floatValue); + } else { + dstBatch.data[idx] = std::stod(floatValue); + } + } catch (...) 
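
convertToInteger() above parses into int64_t first and only then narrows, so overflow is caught by a range check rather than by undefined narrowing behavior. A minimal standalone version of that stoll-then-downcast flow for an int16_t destination:

#include <cstdint>
#include <limits>
#include <string>

bool parseToInt16(const std::string& s, int16_t& out) {
  int64_t wide;
  try {
    wide = std::stoll(s);
  } catch (...) {
    return false;  // parse-error path
  }
  // Mirror of downCastToInteger(): reject values outside the target range.
  if (wide < std::numeric_limits<int16_t>::min() ||
      wide > std::numeric_limits<int16_t>::max()) {
    return false;  // overflow path
  }
  out = static_cast<int16_t>(wide);
  return true;
}

int main() {
  int16_t v = 0;
  bool okSmall = parseToInt16("1234", v);  // fits
  bool okBig = parseToInt16("70000", v);   // overflows int16_t
  return (okSmall && !okBig) ? 0 : 1;
}
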
{ + handleParseFromStringError(dstBatch, idx, throwOnOverflow, typeid(readType).name(), + floatValue); + } + } + }; + + class StringVariantConvertColumnReader : public ConvertToStringVariantColumnReader { + public: + StringVariantConvertColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertToStringVariantColumnReader(readType, fileType, stripe, throwOnOverflow) {} + + uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override { + uint64_t size = 0; + strBuffer.resize(numValues); + const auto& srcBatch = *SafeCastBatchTo(data.get()); + const auto maxLength = readType.getMaximumLength(); + if (readType.getKind() == STRING) { + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + strBuffer[i] = std::string(srcBatch.data[i], srcBatch.length[i]); + size += strBuffer[i].size(); + } + } + } else if (readType.getKind() == VARCHAR) { + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + const char* charData = srcBatch.data[i]; + uint64_t originLength = srcBatch.length[i]; + uint64_t itemLength = Utf8Utils::truncateBytesTo(maxLength, charData, originLength); + strBuffer[i] = std::string(charData, itemLength); + size += strBuffer[i].length(); + } + } + } else if (readType.getKind() == CHAR) { + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + const char* charData = srcBatch.data[i]; + uint64_t originLength = srcBatch.length[i]; + uint64_t charLength = Utf8Utils::charLength(charData, originLength); + auto itemLength = Utf8Utils::truncateBytesTo(maxLength, charData, originLength); + strBuffer[i] = std::string(srcBatch.data[i], itemLength); + // the padding is exactly 1 byte per char + if (charLength < maxLength) { + strBuffer[i].resize(itemLength + maxLength - charLength, ' '); + } + size += strBuffer[i].length(); + } + } + } else { + throw SchemaEvolutionError("Invalid type for numeric to string conversion: " + + readType.toString()); + } + return size; + } + }; + + class StringVariantToTimestampColumnReader : public ConvertToTimestampColumnReader { + public: + StringVariantToTimestampColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertToTimestampColumnReader(readType, fileType, stripe, throwOnOverflow) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull); + + const auto& srcBatch = *SafeCastBatchTo(data.get()); + auto& dstBatch = *SafeCastBatchTo(&rowBatch); + + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + convertToTimestamp(dstBatch, i, std::string(srcBatch.data[i], srcBatch.length[i])); + } + } + } + + private: + // Algorithm: http://howardhinnant.github.io/date_algorithms.html + // The algorithm implements a proleptic Gregorian calendar. + int64_t daysFromProlepticGregorianCalendar(int32_t y, int32_t m, int32_t d) { + y -= m <= 2; + int32_t era = y / 400; + int32_t yoe = y - era * 400; // [0, 399] + int32_t doy = (153 * (m + (m > 2 ? 
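
The CHAR branch above truncates to at most maxLength characters and then right-pads with single-byte spaces until the value holds exactly maxLength characters. A simplified ASCII-only sketch of that rule (the real code counts UTF-8 characters via Utf8Utils, not bytes):

#include <string>

std::string padChar(std::string value, size_t n) {
  if (value.size() > n) value.resize(n);       // truncate to n chars
  if (value.size() < n) value.resize(n, ' ');  // pad with spaces to n chars
  return value;
}

int main() { return padChar("ab", 5) == "ab   " ? 0 : 1; }
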
-3 : 9)) + 2) / 5 + d - 1; // [0, 365] + int32_t doe = yoe * 365 + yoe / 4 - yoe / 100 + doy; // [0, 146096] + return 1ll * era * 146097 + doe - 719468; + } + + std::optional> tryBestToParseFromString( + const std::string& timeStr) { + int32_t year, month, day, hour, min, sec, nanos = 0; + int32_t matched = std::sscanf(timeStr.c_str(), "%4d-%2d-%2d %2d:%2d:%2d.%d", &year, &month, + &day, &hour, &min, &sec, &nanos); + if (matched != 6 && matched != 7) { + return std::nullopt; + } + if (nanos) { + if (nanos < 0 || nanos >= 1e9) { + return std::nullopt; + } + while (nanos < static_cast(1e8)) { + nanos *= 10; + } + } + int64_t daysSinceEpoch = daysFromProlepticGregorianCalendar(year, month, day); + int64_t secondSinceEpoch = 60ll * (60 * (24L * daysSinceEpoch + hour) + min) + sec; + return std::make_optional(std::pair{secondSinceEpoch, nanos}); + } + + void convertToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx, + const std::string& timeStr) { + // Expected timestamp_instant format string : yyyy-mm-dd hh:mm:ss[.xxx] timezone + // Eg. "2019-07-09 13:11:00 America/Los_Angeles" + // Expected timestamp format string : yyyy-mm-dd hh:mm:ss[.xxx] + // Eg. "2019-07-09 13:11:00" + static std::string expectedTimestampInstantFormat = "yyyy-mm-dd hh:mm:ss[.xxx] timezone"; + static std::string expectedTimestampFormat = "yyyy-mm-dd hh:mm:ss[.xxx]"; + auto timestamp = tryBestToParseFromString(timeStr); + if (!timestamp.has_value()) { + if (!isInstant) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp", timeStr, + expectedTimestampFormat); + return; + } + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr, + expectedTimestampInstantFormat); + return; + } + + auto& [second, nanos] = timestamp.value(); + + if (isInstant) { + size_t pos = 0; // get the name of timezone + pos = timeStr.find(' ', pos) + 1; + pos = timeStr.find(' ', pos); + if (pos == std::string::npos) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr, + expectedTimestampInstantFormat); + return; + } + pos += 1; + size_t subStrLength = timeStr.length() - pos; + try { + second = getTimezoneByName(timeStr.substr(pos, subStrLength)).convertFromUTC(second); + } catch (const TimezoneError&) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr, + expectedTimestampInstantFormat); + return; + } + } else { + if (needConvertTimezone) { + second = readerTimezone->convertFromUTC(second); + } + } + dstBatch.data[idx] = second; + dstBatch.nanoseconds[idx] = nanos; + } + }; + + template + class StringVariantToDecimalColumnReader : public ConvertColumnReader { + public: + StringVariantToDecimalColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow), + precision_(static_cast(readType.getPrecision())), + scale_(static_cast(readType.getScale())) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertColumnReader::next(rowBatch, numValues, notNull); + + const auto& srcBatch = *SafeCastBatchTo(data.get()); + auto& dstBatch = *SafeCastBatchTo(&rowBatch); + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + convertToDecimal(dstBatch, i, std::string(srcBatch.data[i], srcBatch.length[i])); + } + } + } + + private: + void convertToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, const std::string& decimalStr) { + constexpr 
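
The day count above is Howard Hinnant's civil-date algorithm, and the fractional-seconds loop scales a short fraction like ".5" up to nanoseconds. A standalone self-check combining the sscanf format, the day count (written here to also handle negative years; the version above assumes year >= 0), and the nanos scaling:

#include <cstdint>
#include <cstdio>

int64_t daysFromCivil(int32_t y, int32_t m, int32_t d) {
  y -= m <= 2;
  const int32_t era = (y >= 0 ? y : y - 399) / 400;
  const int32_t yoe = y - era * 400;                                   // [0, 399]
  const int32_t doy = (153 * (m + (m > 2 ? -3 : 9)) + 2) / 5 + d - 1;  // [0, 365]
  const int32_t doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;           // [0, 146096]
  return static_cast<int64_t>(era) * 146097 + doe - 719468;
}

int main() {
  int32_t year, month, day, hour, min, sec, nanos = 0;
  int matched = std::sscanf("2019-07-09 13:11:00.5", "%4d-%2d-%2d %2d:%2d:%2d.%d",
                            &year, &month, &day, &hour, &min, &sec, &nanos);
  if (matched != 7) return 1;
  while (nanos < 100000000) nanos *= 10;  // ".5" scales to 500000000 ns
  int64_t days = daysFromCivil(year, month, day);
  int64_t seconds = 60LL * (60 * (24LL * days + hour) + min) + sec;
  // Epoch maps to day 0; 2019-07-09 is 18086 days after it.
  return (daysFromCivil(1970, 1, 1) == 0 && days == 18086 &&
          nanos == 500000000 && seconds > 0) ? 0 : 1;
}
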
int32_t MAX_PRECISION_128 = 38; + int32_t fromPrecision = 0; + int32_t fromScale = 0; + uint32_t start = 0; + bool negative = false; + if (decimalStr.empty()) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr); + return; + } + auto dotPos = decimalStr.find('.'); + if (dotPos == std::string::npos) { + fromScale = 0; + fromPrecision = decimalStr.length(); + dotPos = decimalStr.length(); + } else { + if (dotPos + 1 == decimalStr.length()) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr); + return; + } + fromPrecision = decimalStr.length() - 1; + fromScale = decimalStr.length() - dotPos - 1; + } + if (decimalStr.front() == '-') { + negative = true; + start++; + fromPrecision--; + } + const std::string integerPortion = decimalStr.substr(start, dotPos - start); + if (dotPos == start || fromPrecision > MAX_PRECISION_128 || fromPrecision <= 0 || + !std::all_of(integerPortion.begin(), integerPortion.end(), ::isdigit)) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr); + return; + } + + Int128 i128; + try { + bool overflow = false; + i128 = Int128(integerPortion); + // overflow won't happen + i128 *= scaleUpInt128ByPowerOfTen(Int128(1), fromScale, overflow); + } catch (const std::exception& e) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr); + return; + } + if (dotPos + 1 < decimalStr.length()) { + const std::string fractionPortion = decimalStr.substr(dotPos + 1, fromScale); + if (!std::all_of(fractionPortion.begin(), fractionPortion.end(), ::isdigit)) { + handleOverflow(dstBatch, idx, throwOnOverflow); + return; + } + i128 += Int128(fractionPortion); + } + + auto [overflow, result] = convertDecimal(i128, fromScale, precision_, scale_); + if (overflow) { + handleOverflow(dstBatch, idx, throwOnOverflow); + return; + } + if (negative) { + result.negate(); + } + + if constexpr (std::is_same_v) { + dstBatch.values[idx] = result; + } else { + if (!result.fitsInLong()) { + handleOverflow(dstBatch, idx, + throwOnOverflow); + } else { + dstBatch.values[idx] = result.toLong(); + } + } + } + + const int32_t precision_; + const int32_t scale_; + }; + #define DEFINE_NUMERIC_CONVERT_READER(FROM, TO, TYPE) \ using FROM##To##TO##ColumnReader = \ NumericConvertColumnReader; @@ -730,6 +1065,18 @@ namespace orc { using Decimal64To##TO##ColumnReader = DecimalToStringVariantColumnReader; \ using Decimal128To##TO##ColumnReader = DecimalToStringVariantColumnReader; +#define DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(FROM, TO, TYPE) \ + using FROM##To##TO##ColumnReader = StringVariantToNumericColumnReader; + +#define DEFINE_STRING_VARIANT_CONVERT_READER(FROM, TO) \ + using FROM##To##TO##ColumnReader = StringVariantConvertColumnReader; + +#define DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(FROM, TO) \ + using FROM##To##TO##ColumnReader = StringVariantToTimestampColumnReader; + +#define DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(FROM, TO) \ + using FROM##To##TO##ColumnReader = StringVariantToDecimalColumnReader; + DEFINE_NUMERIC_CONVERT_READER(Boolean, Byte, int8_t) DEFINE_NUMERIC_CONVERT_READER(Boolean, Short, int16_t) DEFINE_NUMERIC_CONVERT_READER(Boolean, Int, int32_t) @@ -834,8 +1181,57 @@ namespace orc { DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(Char) DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(Varchar) + // String variant to numeric + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Boolean, bool) + 
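
The decimal conversion above builds a scaled integer: the digits before the dot are scaled up by 10^scale and the fraction digits are added on top. A sketch of the same idea using int64_t in place of Int128, and requiring the fraction to already carry exactly `scale` digits (the real reader infers the source scale and rescales through convertDecimal(), and range-checks against 38 digits of precision):

#include <cctype>
#include <cstdint>
#include <optional>
#include <string>

std::optional<int64_t> parseDecimal(const std::string& s, int32_t scale) {
  if (s.empty()) return std::nullopt;
  size_t start = (s.front() == '-') ? 1 : 0;
  size_t dot = s.find('.');
  std::string intPart = (dot == std::string::npos)
                            ? s.substr(start)
                            : s.substr(start, dot - start);
  std::string fracPart = (dot == std::string::npos) ? "" : s.substr(dot + 1);
  auto allDigits = [](const std::string& p) {
    for (char c : p)
      if (!std::isdigit(static_cast<unsigned char>(c))) return false;
    return true;
  };
  if (intPart.empty() || static_cast<int32_t>(fracPart.size()) != scale ||
      !allDigits(intPart) || !allDigits(fracPart)) {
    return std::nullopt;
  }
  int64_t value = std::stoll(intPart);
  for (int32_t i = 0; i < scale; ++i) value *= 10;  // scale up by 10^scale
  if (!fracPart.empty()) value += std::stoll(fracPart);
  return (start == 1) ? -value : value;
}

int main() {
  // "-12.34" at scale 2 becomes the unscaled value -1234.
  return (parseDecimal("-12.34", 2).value_or(0) == -1234 &&
          !parseDecimal("1x.34", 2).has_value()) ? 0 : 1;
}
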
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Byte, int8_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Short, int16_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Int, int32_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Long, int64_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Float, float) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Double, double) + + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Boolean, bool) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Byte, int8_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Short, int16_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Int, int32_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Long, int64_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Float, float) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Double, double) + + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Boolean, bool) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Byte, int8_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Short, int16_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Int, int32_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Long, int64_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Float, float) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Double, double) + + // String variant to string variant + DEFINE_STRING_VARIANT_CONVERT_READER(String, String) + DEFINE_STRING_VARIANT_CONVERT_READER(String, Char) + DEFINE_STRING_VARIANT_CONVERT_READER(String, Varchar) + DEFINE_STRING_VARIANT_CONVERT_READER(Char, Char) + DEFINE_STRING_VARIANT_CONVERT_READER(Char, String) + DEFINE_STRING_VARIANT_CONVERT_READER(Char, Varchar) + DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, String) + DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, Char) + DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, Varchar) + + // String variant to timestamp + DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(String, Timestamp) + DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(Char, Timestamp) + DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(Varchar, Timestamp) + + // String variant to decimal + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(String, Decimal64) + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(String, Decimal128) + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Char, Decimal64) + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Char, Decimal128) + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Varchar, Decimal64) + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Varchar, Decimal128) + #define CREATE_READER(NAME) \ - return std::make_unique(_readType, fileType, stripe, throwOnOverflow); + return std::make_unique(readType, fileType, stripe, throwOnOverflow); #define CASE_CREATE_READER(TYPE, CONVERT) \ case TYPE: \ @@ -858,7 +1254,7 @@ namespace orc { #define CASE_CREATE_DECIMAL_READER(FROM) \ case DECIMAL: { \ - if (isDecimal64(_readType)) { \ + if (isDecimal64(readType)) { \ CREATE_READER(FROM##ToDecimal64ColumnReader) \ } else { \ CREATE_READER(FROM##ToDecimal128ColumnReader) \ @@ -868,7 +1264,7 @@ namespace orc { #define CASE_EXCEPTION \ default: \ throw SchemaEvolutionError("Cannot convert from " + fileType.toString() + " to " + \ - _readType.toString()); + readType.toString()); std::unique_ptr buildConvertReader(const Type& fileType, StripeStreams& stripe, bool useTightNumericVector, @@ -878,11 
+1274,11 @@ namespace orc { "SchemaEvolution only support tight vector, please create ColumnVectorBatch with " "option useTightNumericVector"); } - const auto& _readType = *stripe.getSchemaEvolution()->getReadType(fileType); + const auto& readType = *stripe.getSchemaEvolution()->getReadType(fileType); switch (fileType.getKind()) { case BOOLEAN: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BYTE, BooleanToByte) CASE_CREATE_READER(SHORT, BooleanToShort) CASE_CREATE_READER(INT, BooleanToInt) @@ -906,7 +1302,7 @@ namespace orc { } } case BYTE: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, ByteToBoolean) CASE_CREATE_READER(SHORT, ByteToShort) CASE_CREATE_READER(INT, ByteToInt) @@ -930,7 +1326,7 @@ namespace orc { } } case SHORT: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, ShortToBoolean) CASE_CREATE_READER(BYTE, ShortToByte) CASE_CREATE_READER(INT, ShortToInt) @@ -954,7 +1350,7 @@ namespace orc { } } case INT: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, IntToBoolean) CASE_CREATE_READER(BYTE, IntToByte) CASE_CREATE_READER(SHORT, IntToShort) @@ -978,7 +1374,7 @@ namespace orc { } } case LONG: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, LongToBoolean) CASE_CREATE_READER(BYTE, LongToByte) CASE_CREATE_READER(SHORT, LongToShort) @@ -1002,7 +1398,7 @@ namespace orc { } } case FLOAT: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, FloatToBoolean) CASE_CREATE_READER(BYTE, FloatToByte) CASE_CREATE_READER(SHORT, FloatToShort) @@ -1026,7 +1422,7 @@ namespace orc { } } case DOUBLE: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, DoubleToBoolean) CASE_CREATE_READER(BYTE, DoubleToByte) CASE_CREATE_READER(SHORT, DoubleToShort) @@ -1050,7 +1446,7 @@ namespace orc { } } case DECIMAL: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_FROM_DECIMAL_READER(BOOLEAN, Boolean) CASE_CREATE_FROM_DECIMAL_READER(BYTE, Byte) CASE_CREATE_FROM_DECIMAL_READER(SHORT, Short) @@ -1065,13 +1461,13 @@ namespace orc { CASE_CREATE_FROM_DECIMAL_READER(TIMESTAMP_INSTANT, Timestamp) case DECIMAL: { if (isDecimal64(fileType)) { - if (isDecimal64(_readType)) { + if (isDecimal64(readType)) { CREATE_READER(Decimal64ToDecimal64ColumnReader) } else { CREATE_READER(Decimal64ToDecimal128ColumnReader) } } else { - if (isDecimal64(_readType)) { + if (isDecimal64(readType)) { CREATE_READER(Decimal128ToDecimal64ColumnReader) } else { CREATE_READER(Decimal128ToDecimal128ColumnReader) @@ -1087,7 +1483,96 @@ namespace orc { CASE_EXCEPTION } } - case STRING: + case STRING: { + switch (readType.getKind()) { + CASE_CREATE_READER(BOOLEAN, StringToBoolean) + CASE_CREATE_READER(BYTE, StringToByte) + CASE_CREATE_READER(SHORT, StringToShort) + CASE_CREATE_READER(INT, StringToInt) + CASE_CREATE_READER(LONG, StringToLong) + CASE_CREATE_READER(FLOAT, StringToFloat) + CASE_CREATE_READER(DOUBLE, StringToDouble) + CASE_CREATE_READER(STRING, StringToString) + CASE_CREATE_READER(CHAR, StringToChar) + CASE_CREATE_READER(VARCHAR, StringToVarchar) + CASE_CREATE_READER(TIMESTAMP, StringToTimestamp) + CASE_CREATE_READER(TIMESTAMP_INSTANT, StringToTimestamp) + case DECIMAL: { + if (isDecimal64(readType)) { + CREATE_READER(StringToDecimal64ColumnReader) + } else { + CREATE_READER(StringToDecimal128ColumnReader) + } 
+ } + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + CASE_EXCEPTION + } + } + case CHAR: { + switch (readType.getKind()) { + CASE_CREATE_READER(BOOLEAN, CharToBoolean) + CASE_CREATE_READER(BYTE, CharToByte) + CASE_CREATE_READER(SHORT, CharToShort) + CASE_CREATE_READER(INT, CharToInt) + CASE_CREATE_READER(LONG, CharToLong) + CASE_CREATE_READER(FLOAT, CharToFloat) + CASE_CREATE_READER(DOUBLE, CharToDouble) + CASE_CREATE_READER(STRING, CharToString) + CASE_CREATE_READER(CHAR, CharToChar) + CASE_CREATE_READER(VARCHAR, CharToVarchar) + CASE_CREATE_READER(TIMESTAMP, CharToTimestamp) + CASE_CREATE_READER(TIMESTAMP_INSTANT, CharToTimestamp) + case DECIMAL: { + if (isDecimal64(readType)) { + CREATE_READER(CharToDecimal64ColumnReader) + } else { + CREATE_READER(CharToDecimal128ColumnReader) + } + } + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + CASE_EXCEPTION + } + } + case VARCHAR: { + switch (readType.getKind()) { + CASE_CREATE_READER(BOOLEAN, VarcharToBoolean) + CASE_CREATE_READER(BYTE, VarcharToByte) + CASE_CREATE_READER(SHORT, VarcharToShort) + CASE_CREATE_READER(INT, VarcharToInt) + CASE_CREATE_READER(LONG, VarcharToLong) + CASE_CREATE_READER(FLOAT, VarcharToFloat) + CASE_CREATE_READER(DOUBLE, VarcharToDouble) + CASE_CREATE_READER(STRING, VarcharToString) + CASE_CREATE_READER(CHAR, VarcharToChar) + CASE_CREATE_READER(VARCHAR, VarcharToVarchar) + CASE_CREATE_READER(TIMESTAMP, VarcharToTimestamp) + CASE_CREATE_READER(TIMESTAMP_INSTANT, VarcharToTimestamp) + case DECIMAL: { + if (isDecimal64(readType)) { + CREATE_READER(VarcharToDecimal64ColumnReader) + } else { + CREATE_READER(VarcharToDecimal128ColumnReader) + } + } + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + CASE_EXCEPTION + } + } case BINARY: case TIMESTAMP: case LIST: @@ -1095,21 +1580,9 @@ namespace orc { case STRUCT: case UNION: case DATE: - case VARCHAR: - case CHAR: case TIMESTAMP_INSTANT: CASE_EXCEPTION } } -#undef DEFINE_NUMERIC_CONVERT_READER -#undef DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER -#undef DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER -#undef DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER -#undef DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER -#undef DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER -#undef CASE_CREATE_FROM_DECIMAL_READER -#undef CASE_CREATE_READER -#undef CASE_EXCEPTION - } // namespace orc diff --git a/c++/src/CpuInfoUtil.cc b/c++/src/CpuInfoUtil.cc index 82669de20a..588f8dc96a 100644 --- a/c++/src/CpuInfoUtil.cc +++ b/c++/src/CpuInfoUtil.cc @@ -74,7 +74,7 @@ namespace orc { #if defined(_WIN32) //------------------------------ WINDOWS ------------------------------// - void OsRetrieveCacheSize(std::array* cache_sizes) { + void OsRetrieveCacheSize(std::array* cacheSizes) { PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = nullptr; PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer_position = nullptr; DWORD buffer_size = 0; @@ -108,8 +108,8 @@ namespace orc { if (RelationCache == buffer_position->Relationship) { PCACHE_DESCRIPTOR cache = &buffer_position->Cache; if (cache->Level >= 1 && cache->Level <= kCacheLevels) { - const int64_t current = (*cache_sizes)[cache->Level - 1]; - (*cache_sizes)[cache->Level - 1] = std::max(current, cache->Size); + const int64_t current = (*cacheSizes)[cache->Level - 1]; + (*cacheSizes)[cache->Level - 1] = std::max(current, cache->Size); } } offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); @@ -136,23 +136,22 @@ namespace orc { } #endif // MINGW - void OsRetrieveCpuInfo(int64_t* 
hardware_flags, CpuInfo::Vendor* vendor, - std::string* model_name) { + void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor, + std::string* modelName) { int register_EAX_id = 1; int highest_valid_id = 0; int highest_extended_valid_id = 0; std::bitset<32> features_ECX; - std::array cpu_info; + std::array cpuInfo; // Get highest valid id - __cpuid(cpu_info.data(), 0); - highest_valid_id = cpu_info[0]; + __cpuid(cpuInfo.data(), 0); + highest_valid_id = cpuInfo[0]; // HEX of "GenuineIntel": 47656E75 696E6549 6E74656C // HEX of "AuthenticAMD": 41757468 656E7469 63414D44 - if (cpu_info[1] == 0x756e6547 && cpu_info[3] == 0x49656e69 && cpu_info[2] == 0x6c65746e) { + if (cpuInfo[1] == 0x756e6547 && cpuInfo[3] == 0x49656e69 && cpuInfo[2] == 0x6c65746e) { *vendor = CpuInfo::Vendor::Intel; - } else if (cpu_info[1] == 0x68747541 && cpu_info[3] == 0x69746e65 && - cpu_info[2] == 0x444d4163) { + } else if (cpuInfo[1] == 0x68747541 && cpuInfo[3] == 0x69746e65 && cpuInfo[2] == 0x444d4163) { *vendor = CpuInfo::Vendor::AMD; } @@ -161,19 +160,19 @@ namespace orc { } // EAX=1: Processor Info and Feature Bits - __cpuidex(cpu_info.data(), register_EAX_id, 0); - features_ECX = cpu_info[2]; + __cpuidex(cpuInfo.data(), register_EAX_id, 0); + features_ECX = cpuInfo[2]; // Get highest extended id - __cpuid(cpu_info.data(), 0x80000000); - highest_extended_valid_id = cpu_info[0]; + __cpuid(cpuInfo.data(), 0x80000000); + highest_extended_valid_id = cpuInfo[0]; // Retrieve CPU model name if (highest_extended_valid_id >= static_cast(0x80000004)) { - model_name->clear(); + modelName->clear(); for (int i = 0x80000002; i <= static_cast(0x80000004); ++i) { - __cpuidex(cpu_info.data(), i, 0); - *model_name += std::string(reinterpret_cast(cpu_info.data()), sizeof(cpu_info)); + __cpuidex(cpuInfo.data(), i, 0); + *modelName += std::string(reinterpret_cast(cpuInfo.data()), sizeof(cpuInfo)); } } @@ -184,37 +183,37 @@ namespace orc { zmm_enabled = (xcr0 & 0xE0) == 0xE0; } - if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3; - if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1; - if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2; - if (features_ECX[23]) *hardware_flags |= CpuInfo::POPCNT; - if (features_ECX[28]) *hardware_flags |= CpuInfo::AVX; + if (features_ECX[9]) *hardwareFlags |= CpuInfo::SSSE3; + if (features_ECX[19]) *hardwareFlags |= CpuInfo::SSE4_1; + if (features_ECX[20]) *hardwareFlags |= CpuInfo::SSE4_2; + if (features_ECX[23]) *hardwareFlags |= CpuInfo::POPCNT; + if (features_ECX[28]) *hardwareFlags |= CpuInfo::AVX; // cpuid with EAX=7, ECX=0: Extended Features register_EAX_id = 7; if (highest_valid_id > register_EAX_id) { - __cpuidex(cpu_info.data(), register_EAX_id, 0); - std::bitset<32> features_EBX = cpu_info[1]; + __cpuidex(cpuInfo.data(), register_EAX_id, 0); + std::bitset<32> features_EBX = cpuInfo[1]; - if (features_EBX[3]) *hardware_flags |= CpuInfo::BMI1; - if (features_EBX[5]) *hardware_flags |= CpuInfo::AVX2; - if (features_EBX[8]) *hardware_flags |= CpuInfo::BMI2; + if (features_EBX[3]) *hardwareFlags |= CpuInfo::BMI1; + if (features_EBX[5]) *hardwareFlags |= CpuInfo::AVX2; + if (features_EBX[8]) *hardwareFlags |= CpuInfo::BMI2; if (zmm_enabled) { - if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F; - if (features_EBX[17]) *hardware_flags |= CpuInfo::AVX512DQ; - if (features_EBX[28]) *hardware_flags |= CpuInfo::AVX512CD; - if (features_EBX[30]) *hardware_flags |= CpuInfo::AVX512BW; - if (features_EBX[31]) *hardware_flags |= CpuInfo::AVX512VL; + if 
(features_EBX[16]) *hardwareFlags |= CpuInfo::AVX512F; + if (features_EBX[17]) *hardwareFlags |= CpuInfo::AVX512DQ; + if (features_EBX[28]) *hardwareFlags |= CpuInfo::AVX512CD; + if (features_EBX[30]) *hardwareFlags |= CpuInfo::AVX512BW; + if (features_EBX[31]) *hardwareFlags |= CpuInfo::AVX512VL; } } } #elif defined(CPUINFO_ARCH_ARM) // Windows on Arm - void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, - std::string* model_name) { - *hardware_flags |= CpuInfo::ASIMD; - // TODO: vendor, model_name + void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor, + std::string* modelName) { + *hardwareFlags |= CpuInfo::ASIMD; + // TODO: vendor, modelName } #endif @@ -236,25 +235,25 @@ namespace orc { return std::nullopt; } - void OsRetrieveCacheSize(std::array* cache_sizes) { + void OsRetrieveCacheSize(std::array* cacheSizes) { static_assert(kCacheLevels >= 3, ""); auto c = IntegerSysCtlByName("hw.l1dcachesize"); if (c.has_value()) { - (*cache_sizes)[0] = *c; + (*cacheSizes)[0] = *c; } c = IntegerSysCtlByName("hw.l2cachesize"); if (c.has_value()) { - (*cache_sizes)[1] = *c; + (*cacheSizes)[1] = *c; } c = IntegerSysCtlByName("hw.l3cachesize"); if (c.has_value()) { - (*cache_sizes)[2] = *c; + (*cacheSizes)[2] = *c; } } - void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, - std::string* model_name) { - // hardware_flags + void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor, + std::string* modelName) { + // hardwareFlags struct SysCtlCpuFeature { const char* name; int64_t flag; @@ -280,13 +279,13 @@ namespace orc { for (const auto& feature : features) { auto v = IntegerSysCtlByName(feature.name); if (v.value_or(0)) { - *hardware_flags |= feature.flag; + *hardwareFlags |= feature.flag; } } - // TODO: vendor, model_name + // TODO: vendor, modelName *vendor = CpuInfo::Vendor::Unknown; - *model_name = "Unknown"; + *modelName = "Unknown"; } #else @@ -345,7 +344,7 @@ namespace orc { const struct { std::string name; int64_t flag; - } flag_mappings[] = { + } flagMappings[] = { #if defined(CPUINFO_ARCH_X86) {"ssse3", CpuInfo::SSSE3}, {"sse4_1", CpuInfo::SSE4_1}, @@ -364,12 +363,12 @@ namespace orc { {"asimd", CpuInfo::ASIMD}, #endif }; - const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]); + const int64_t num_flags = sizeof(flagMappings) / sizeof(flagMappings[0]); int64_t flags = 0; for (int i = 0; i < num_flags; ++i) { - if (values.find(flag_mappings[i].name) != std::string::npos) { - flags |= flag_mappings[i].flag; + if (values.find(flagMappings[i].name) != std::string::npos) { + flags |= flagMappings[i].flag; } } return flags; @@ -469,9 +468,9 @@ namespace orc { #elif defined(CPUINFO_ARCH_ARM) //------------------------------ AARCH64 ------------------------------// - bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { - if (simd_level == "NONE") { - *hardware_flags &= ~CpuInfo::ASIMD; + bool ArchParseUserSimdLevel(const std::string& simdLevel, int64_t* hardwareFlags) { + if (simdLevel == "NONE") { + *hardwareFlags &= ~CpuInfo::ASIMD; return true; } return false; @@ -485,7 +484,7 @@ namespace orc { #else //------------------------------ PPC, ... 
------------------------------// - bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { + bool ArchParseUserSimdLevel(const std::string& simdLevel, int64_t* hardwareFlags) { return true; } @@ -496,17 +495,17 @@ namespace orc { } // namespace struct CpuInfo::Impl { - int64_t hardware_flags = 0; + int64_t hardwareFlags = 0; int numCores = 0; - int64_t original_hardware_flags = 0; + int64_t originalHardwareFlags = 0; Vendor vendor = Vendor::Unknown; - std::string model_name = "Unknown"; - std::array cache_sizes{}; + std::string modelName = "Unknown"; + std::array cacheSizes{}; Impl() { - OsRetrieveCacheSize(&cache_sizes); - OsRetrieveCpuInfo(&hardware_flags, &vendor, &model_name); - original_hardware_flags = hardware_flags; + OsRetrieveCacheSize(&cacheSizes); + OsRetrieveCpuInfo(&hardwareFlags, &vendor, &modelName); + originalHardwareFlags = hardwareFlags; numCores = std::max(static_cast(std::thread::hardware_concurrency()), 1); // parse user simd level @@ -514,7 +513,7 @@ namespace orc { std::string userSimdLevel = maybe_env_var == nullptr ? "NONE" : std::string(maybe_env_var); std::transform(userSimdLevel.begin(), userSimdLevel.end(), userSimdLevel.begin(), [](unsigned char c) { return std::toupper(c); }); - if (!ArchParseUserSimdLevel(userSimdLevel, &hardware_flags)) { + if (!ArchParseUserSimdLevel(userSimdLevel, &hardwareFlags)) { throw ParseError("Invalid value for ORC_USER_SIMD_LEVEL: " + userSimdLevel); } } @@ -530,8 +529,8 @@ namespace orc { #endif const CpuInfo* CpuInfo::getInstance() { - static CpuInfo cpu_info; - return &cpu_info; + static CpuInfo cpuInfo; + return &cpuInfo; } #ifdef __clang__ @@ -539,7 +538,7 @@ namespace orc { #endif int64_t CpuInfo::hardwareFlags() const { - return impl_->hardware_flags; + return impl_->hardwareFlags; } int CpuInfo::numCores() const { @@ -551,7 +550,7 @@ namespace orc { } const std::string& CpuInfo::modelName() const { - return impl_->model_name; + return impl_->modelName; } int64_t CpuInfo::cacheSize(CacheLevel level) const { @@ -564,18 +563,18 @@ namespace orc { static_assert(static_cast(CacheLevel::L1) == 0, ""); const int i = static_cast(level); - if (impl_->cache_sizes[i] > 0) return impl_->cache_sizes[i]; + if (impl_->cacheSizes[i] > 0) return impl_->cacheSizes[i]; if (i == 0) return kDefaultCacheSizes[0]; // l3 may be not available, return maximum of l2 or default size - return std::max(kDefaultCacheSizes[i], impl_->cache_sizes[i - 1]); + return std::max(kDefaultCacheSizes[i], impl_->cacheSizes[i - 1]); } bool CpuInfo::isSupported(int64_t flags) const { - return (impl_->hardware_flags & flags) == flags; + return (impl_->hardwareFlags & flags) == flags; } bool CpuInfo::isDetected(int64_t flags) const { - return (impl_->original_hardware_flags & flags) == flags; + return (impl_->originalHardwareFlags & flags) == flags; } void CpuInfo::verifyCpuRequirements() const { diff --git a/c++/src/Exceptions.cc b/c++/src/Exceptions.cc index 30ecf7dc7c..2ba1ab404c 100644 --- a/c++/src/Exceptions.cc +++ b/c++/src/Exceptions.cc @@ -84,4 +84,20 @@ namespace orc { SchemaEvolutionError::~SchemaEvolutionError() noexcept { // PASS } + + CompressionError::CompressionError(const std::string& whatArg) : runtime_error(whatArg) { + // PASS + } + + CompressionError::CompressionError(const char* whatArg) : runtime_error(whatArg) { + // PASS + } + + CompressionError::CompressionError(const CompressionError& error) : runtime_error(error) { + // PASS + } + + CompressionError::~CompressionError() noexcept { + // PASS + } } // namespace 
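
With the rename done, the CpuInfo surface stays the same: isDetected() reflects the raw cpuid result, while isSupported() reflects the flags after any ORC_USER_SIMD_LEVEL override. A hedged usage sketch, assuming the internal CpuInfoUtil.hh header is reachable from the caller:

#include <iostream>

#include "CpuInfoUtil.hh"  // assumed include path; this is an internal header

int main() {
  const orc::CpuInfo* info = orc::CpuInfo::getInstance();
  std::cout << "model: " << info->modelName() << "\n";
#if defined(CPUINFO_ARCH_X86)
  // Detected by hardware but masked by the user override.
  if (info->isDetected(orc::CpuInfo::AVX2) && !info->isSupported(orc::CpuInfo::AVX2)) {
    std::cout << "AVX2 present but disabled via ORC_USER_SIMD_LEVEL\n";
  }
#endif
  return 0;
}
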
orc diff --git a/c++/src/Int128.cc b/c++/src/Int128.cc index 4a1d0b763a..1e059fd4e2 100644 --- a/c++/src/Int128.cc +++ b/c++/src/Int128.cc @@ -27,7 +27,7 @@ namespace orc { Int128 Int128::maximumValue() { - return Int128(0x7fffffffffffffff, 0xfffffffffffffff); + return Int128(0x7fffffffffffffff, 0xffffffffffffffff); } Int128 Int128::minimumValue() { diff --git a/c++/src/LzoDecompressor.cc b/c++/src/LzoDecompressor.cc index f494f4b651..68e25425c2 100644 --- a/c++/src/LzoDecompressor.cc +++ b/c++/src/LzoDecompressor.cc @@ -342,7 +342,7 @@ namespace orc { char* literalOutputLimit = output + literalLength; if (literalOutputLimit > fastOutputLimit || input + literalLength > inputLimit - SIZE_OF_LONG) { - if (literalOutputLimit > outputLimit) { + if (literalOutputLimit > outputLimit || input + literalLength > inputLimit) { throw MalformedInputException(input - inputAddress); } diff --git a/c++/src/Options.hh b/c++/src/Options.hh index daf9d52e1c..0a4bd56d8f 100644 --- a/c++/src/Options.hh +++ b/c++/src/Options.hh @@ -23,6 +23,8 @@ #include "orc/OrcFile.hh" #include "orc/Reader.hh" +#include "io/Cache.hh" + #include namespace orc { @@ -43,6 +45,7 @@ namespace orc { MemoryPool* memoryPool; std::string serializedTail; ReaderMetrics* metrics; + CacheOptions cacheOptions; ReaderOptionsPrivate() { tailLocation = std::numeric_limits::max(); @@ -122,6 +125,15 @@ namespace orc { return privateBits_->errorStream; } + ReaderOptions& ReaderOptions::setCacheOptions(const CacheOptions& cacheOptions) { + privateBits_->cacheOptions = cacheOptions; + return *this; + } + + const CacheOptions& ReaderOptions::getCacheOptions() const { + return privateBits_->cacheOptions; + } + /** * RowReaderOptions Implementation */ diff --git a/c++/src/OrcFile.cc b/c++/src/OrcFile.cc index 8899299d3d..be86724329 100644 --- a/c++/src/OrcFile.cc +++ b/c++/src/OrcFile.cc @@ -79,7 +79,7 @@ namespace orc { } void read(void* buf, uint64_t length, uint64_t offset) override { - SCOPED_STOPWATCH(metrics, IOBlockingLatencyUs, IOCount); + SCOPED_STOPWATCH(metrics_, IOBlockingLatencyUs, IOCount); if (!buf) { throw ParseError("Buffer is null"); } diff --git a/c++/src/OrcHdfsFile.cc b/c++/src/OrcHdfsFile.cc index 09ff71a0e9..d878e276cb 100644 --- a/c++/src/OrcHdfsFile.cc +++ b/c++/src/OrcHdfsFile.cc @@ -42,23 +42,23 @@ namespace orc { class HdfsFileInputStream : public InputStream { private: - std::string filename; - std::unique_ptr file; - std::unique_ptr file_system; - uint64_t totalLength; - const uint64_t READ_SIZE = 1024 * 1024; // 1 MB - ReaderMetrics* metrics; + std::string filename_; + std::unique_ptr file_; + std::unique_ptr fileSystem_; + uint64_t totalLength_; + const uint64_t readSize_ = 1024 * 1024; // 1 MB + ReaderMetrics* metrics_; public: - HdfsFileInputStream(std::string _filename, ReaderMetrics* _metrics) : metrics(_metrics) { - filename = _filename; + HdfsFileInputStream(std::string filename, ReaderMetrics* metrics) : metrics_(metrics) { + filename_ = filename; // Building a URI object from the given uri_path hdfs::URI uri; try { - uri = hdfs::URI::parse_from_string(filename); + uri = hdfs::URI::parse_from_string(filename_); } catch (const hdfs::uri_parse_error&) { - throw ParseError("Malformed URI: " + filename); + throw ParseError("Malformed URI: " + filename_); } // This sets conf path to default "$HADOOP_CONF_DIR" or "/etc/hadoop/conf" @@ -82,9 +82,9 @@ namespace orc { } hdfs::IoService* io_service = hdfs::IoService::New(); // Wrapping file_system into a unique pointer to guarantee deletion - file_system = + fileSystem_ 
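
The one-character Int128 change above is a real bug fix, not a style edit: the old literal 0xfffffffffffffff has only 15 hex digits, i.e. only the low 60 bits set, so maximumValue() fell short of 2^127 - 1. A quick check of the gap:

#include <cstdint>

int main() {
  uint64_t oldLow = 0xfffffffffffffff;   // 15 'f' digits: 2^60 - 1
  uint64_t newLow = 0xffffffffffffffff;  // 16 'f' digits: 2^64 - 1
  // The old maximum was short by exactly 15 * 2^60 in the low word.
  return ((newLow - oldLow) == (15ULL << 60)) ? 0 : 1;
}
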
= std::unique_ptr(hdfs::FileSystem::New(io_service, "", options)); - if (file_system.get() == nullptr) { + if (fileSystem_.get() == nullptr) { throw ParseError("Can't create FileSystem object. "); } hdfs::Status status; @@ -92,13 +92,13 @@ namespace orc { if (!uri.get_host().empty()) { // Using port if supplied, otherwise using "" to look up port in configs std::string port = uri.has_port() ? std::to_string(uri.get_port()) : ""; - status = file_system->Connect(uri.get_host(), port); + status = fileSystem_->Connect(uri.get_host(), port); if (!status.ok()) { throw ParseError("Can't connect to " + uri.get_host() + ":" + port + ". " + status.ToString()); } } else { - status = file_system->ConnectToDefaultFs(); + status = fileSystem_->ConnectToDefaultFs(); if (!status.ok()) { if (!options.defaultFS.get_host().empty()) { throw ParseError("Error connecting to " + options.defaultFS.str() + ". " + @@ -110,32 +110,32 @@ namespace orc { } } - if (file_system.get() == nullptr) { + if (fileSystem_.get() == nullptr) { throw ParseError("Can't connect the file system. "); } hdfs::FileHandle* file_raw = nullptr; - status = file_system->Open(uri.get_path(true), &file_raw); + status = fileSystem_->Open(uri.get_path(true), &file_raw); if (!status.ok()) { throw ParseError("Can't open " + uri.get_path(true) + ". " + status.ToString()); } // Wrapping file_raw into a unique pointer to guarantee deletion - file.reset(file_raw); + file_.reset(file_raw); hdfs::StatInfo stat_info; - status = file_system->GetFileInfo(uri.get_path(true), stat_info); + status = fileSystem_->GetFileInfo(uri.get_path(true), stat_info); if (!status.ok()) { throw ParseError("Can't stat " + uri.get_path(true) + ". " + status.ToString()); } - totalLength = stat_info.length; + totalLength_ = stat_info.length; } uint64_t getLength() const override { - return totalLength; + return totalLength_; } uint64_t getNaturalReadSize() const override { - return READ_SIZE; + return readSize_; } void read(void* buf, uint64_t length, uint64_t offset) override { @@ -151,8 +151,8 @@ namespace orc { do { status = - file->PositionRead(buf_ptr, static_cast(length) - total_bytes_read, - static_cast(offset + total_bytes_read), &last_bytes_read); + file_->PositionRead(buf_ptr, static_cast(length) - total_bytes_read, + static_cast(offset + total_bytes_read), &last_bytes_read); if (!status.ok()) { throw ParseError("Error reading the file: " + status.ToString()); } @@ -162,7 +162,7 @@ namespace orc { } const std::string& getName() const override { - return filename; + return filename_; } ~HdfsFileInputStream() override; diff --git a/c++/src/RLE.cc b/c++/src/RLE.cc index 89aca6a10e..cb831c80f7 100644 --- a/c++/src/RLE.cc +++ b/c++/src/RLE.cc @@ -108,15 +108,23 @@ namespace orc { void RleEncoder::recordPosition(PositionRecorder* recorder) const { uint64_t flushedSize = outputStream->getSize(); - uint64_t unflushedSize = static_cast(bufferPosition); + uint64_t unusedBufferSize = static_cast(bufferLength - bufferPosition); if (outputStream->isCompressed()) { recorder->add(flushedSize); - recorder->add(unflushedSize); + // There are multiple blocks in the input buffer, but bufferPosition only records the + // effective length of the last block. We need rawInputBufferSize to record the total length + // of all variable blocks. 
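
The PositionRead loop above exists because positional reads may return fewer bytes than requested, so the caller must loop until the buffer is full. A generic standalone form of that read-fully pattern, where `pread` is any callable with pread(2)-style semantics:

#include <cstdint>
#include <cstring>
#include <functional>
#include <stdexcept>

void readFully(const std::function<int64_t(void*, uint64_t, uint64_t)>& pread,
               void* buf, uint64_t length, uint64_t offset) {
  uint64_t total = 0;
  while (total < length) {
    int64_t n = pread(static_cast<char*>(buf) + total, length - total,
                      offset + total);
    if (n <= 0) {
      throw std::runtime_error("Error reading the file: short read");
    }
    total += static_cast<uint64_t>(n);
  }
}

int main() {
  const char src[] = "0123456789";
  // A deliberately stingy reader that returns at most 3 bytes per call.
  auto stingy = [&](void* dst, uint64_t len, uint64_t off) -> int64_t {
    uint64_t n = len < 3 ? len : 3;
    std::memcpy(dst, src + off, n);
    return static_cast<int64_t>(n);
  };
  char out[10];
  readFully(stingy, out, sizeof(out), 0);
  return std::memcmp(out, src, 10) == 0 ? 0 : 1;
}
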
+ recorder->add(outputStream->getRawInputBufferSize() - unusedBufferSize); } else { - flushedSize -= static_cast(bufferLength); - recorder->add(flushedSize + unflushedSize); + recorder->add(flushedSize - unusedBufferSize); } recorder->add(static_cast(numLiterals)); } + void RleEncoder::finishEncode() { + outputStream->BackUp(static_cast(bufferLength - bufferPosition)); + outputStream->finishStream(); + bufferLength = bufferPosition = 0; + } + } // namespace orc diff --git a/c++/src/RLE.hh b/c++/src/RLE.hh index a45b4056bc..e46504e885 100644 --- a/c++/src/RLE.hh +++ b/c++/src/RLE.hh @@ -84,6 +84,13 @@ namespace orc { virtual void write(int64_t val) = 0; + /** + * Finalize the encoding process. This function should be called after all data required for + * encoding has been added. It ensures that any remaining data is processed and the final state + * of the encoder is set. + */ + virtual void finishEncode(); + protected: std::unique_ptr outputStream; size_t bufferPosition; diff --git a/c++/src/RLEv1.cc b/c++/src/RLEv1.cc index 5d6f600669..72c555e610 100644 --- a/c++/src/RLEv1.cc +++ b/c++/src/RLEv1.cc @@ -74,10 +74,8 @@ namespace orc { } uint64_t RleEncoderV1::flush() { - writeValues(); - outputStream->BackUp(static_cast(bufferLength - bufferPosition)); + finishEncode(); uint64_t dataSize = outputStream->flush(); - bufferLength = bufferPosition = 0; return dataSize; } @@ -135,6 +133,11 @@ namespace orc { } } + void RleEncoderV1::finishEncode() { + writeValues(); + RleEncoder::finishEncode(); + } + signed char RleDecoderV1::readByte() { SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs); if (bufferStart_ == bufferEnd_) { diff --git a/c++/src/RLEv1.hh b/c++/src/RLEv1.hh index a2a00c9305..024b1e5e97 100644 --- a/c++/src/RLEv1.hh +++ b/c++/src/RLEv1.hh @@ -38,6 +38,8 @@ namespace orc { void write(int64_t val) override; + void finishEncode() override; + private: int64_t delta_; bool repeat_; diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh index a8e0340e7e..8ceb7f125b 100644 --- a/c++/src/RLEv2.hh +++ b/c++/src/RLEv2.hh @@ -108,6 +108,8 @@ namespace orc { void write(int64_t val) override; + void finishEncode() override; + private: const bool alignedBitPacking_; uint32_t fixedRunLength_; diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 8a43818a53..17bf835203 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -751,27 +751,35 @@ namespace orc { return *(contents_->schema.get()); } - std::unique_ptr ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const { + std::unique_ptr ReaderImpl::getStripeStatistics(uint64_t stripeIndex, + bool includeRowIndex) const { if (!isMetadataLoaded_) { readMetadata(); } if (contents_->metadata == nullptr) { throw std::logic_error("No stripe statistics in file"); } - size_t num_cols = static_cast( - contents_->metadata->stripe_stats(static_cast(stripeIndex)).col_stats_size()); - std::vector> indexStats(num_cols); proto::StripeInformation currentStripeInfo = footer_->stripes(static_cast(stripeIndex)); proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents_.get()); - getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats); - const Timezone& writerTZ = currentStripeFooter.has_writer_timezone() ? 
getTimezoneByName(currentStripeFooter.writer_timezone()) : getLocalTimezone(); StatContext statContext(hasCorrectStatistics(), &writerTZ); - return std::make_unique( + + if (!includeRowIndex) { + return std::make_unique( + contents_->metadata->stripe_stats(static_cast(stripeIndex)), statContext); + } + + size_t num_cols = static_cast( + contents_->metadata->stripe_stats(static_cast(stripeIndex)).col_stats_size()); + std::vector> indexStats(num_cols); + + getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats); + + return std::make_unique( contents_->metadata->stripe_stats(static_cast(stripeIndex)), indexStats, statContext); } @@ -1117,7 +1125,7 @@ namespace orc { } bool RowReaderImpl::next(ColumnVectorBatch& data) { - SCOPED_STOPWATCH(contents->readerMetrics, ReaderInclusiveLatencyUs, ReaderCall); + SCOPED_STOPWATCH(contents_->readerMetrics, ReaderInclusiveLatencyUs, ReaderCall); if (currentStripe_ >= lastStripe_) { data.numElements = 0; markEndOfFile(); @@ -1426,17 +1434,10 @@ namespace orc { uint32_t stripeIndex, const std::set& included) const { std::map ret; - // find stripe info - if (stripeIndex >= static_cast(footer_->stripes_size())) { - throw std::logic_error("Illegal stripe index: " + - to_string(static_cast(stripeIndex))); - } - const proto::StripeInformation currentStripeInfo = - footer_->stripes(static_cast(stripeIndex)); - const proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents_); + uint64_t offset; + auto currentStripeFooter = loadCurrentStripeFooter(stripeIndex, offset); // iterate stripe footer to get stream of bloom_filter - uint64_t offset = static_cast(currentStripeInfo.offset()); for (int i = 0; i < currentStripeFooter.streams_size(); i++) { const proto::Stream& stream = currentStripeFooter.streams(i); uint32_t column = static_cast(stream.column()); @@ -1474,6 +1475,150 @@ namespace orc { return ret; } + proto::StripeFooter ReaderImpl::loadCurrentStripeFooter(uint32_t stripeIndex, + uint64_t& offset) const { + // find stripe info + if (stripeIndex >= static_cast(footer_->stripes_size())) { + throw std::logic_error("Illegal stripe index: " + + to_string(static_cast(stripeIndex))); + } + const proto::StripeInformation currentStripeInfo = + footer_->stripes(static_cast(stripeIndex)); + offset = static_cast(currentStripeInfo.offset()); + return getStripeFooter(currentStripeInfo, *contents_); + } + + std::map ReaderImpl::getRowGroupIndex( + uint32_t stripeIndex, const std::set& included) const { + std::map ret; + uint64_t offset; + auto currentStripeFooter = loadCurrentStripeFooter(stripeIndex, offset); + + // iterate stripe footer to get stream of row_index + for (int i = 0; i < currentStripeFooter.streams_size(); i++) { + const proto::Stream& stream = currentStripeFooter.streams(i); + uint32_t column = static_cast(stream.column()); + uint64_t length = static_cast(stream.length()); + RowGroupIndex& rowGroupIndex = ret[column]; + + if (stream.kind() == proto::Stream_Kind_ROW_INDEX && + (included.empty() || included.find(column) != included.end())) { + std::unique_ptr pbStream = + createDecompressor(contents_->compression, + std::make_unique( + contents_->stream.get(), offset, length, *contents_->pool), + contents_->blockSize, *(contents_->pool), contents_->readerMetrics); + + proto::RowIndex pbRowIndex; + if (!pbRowIndex.ParseFromZeroCopyStream(pbStream.get())) { + std::stringstream errMsgBuffer; + errMsgBuffer << "Failed to parse RowIndex at column " << column << " in stripe " + << stripeIndex; + throw 
ParseError(errMsgBuffer.str()); + } + + // add rowGroupIndex to result for one column + for (auto& rowIndexEntry : pbRowIndex.entry()) { + std::vector posVector; + for (auto& position : rowIndexEntry.positions()) { + posVector.push_back(position); + } + rowGroupIndex.positions.push_back(posVector); + } + } + offset += length; + } + return ret; + } + + void ReaderImpl::releaseBuffer(uint64_t boundary) { + std::lock_guard lock(contents_->readCacheMutex); + + if (contents_->readCache) { + contents_->readCache->evictEntriesBefore(boundary); + } + } + + void ReaderImpl::preBuffer(const std::vector& stripes, + const std::list& includeTypes) { + std::vector newStripes; + for (auto stripe : stripes) { + if (stripe < static_cast(footer_->stripes_size())) newStripes.push_back(stripe); + } + + std::list newIncludeTypes; + for (auto type : includeTypes) { + if (type < static_cast(footer_->types_size())) newIncludeTypes.push_back(type); + } + + if (newStripes.empty() || newIncludeTypes.empty()) { + return; + } + + orc::RowReaderOptions rowReaderOptions; + rowReaderOptions.includeTypes(newIncludeTypes); + ColumnSelector columnSelector(contents_.get()); + std::vector selectedColumns; + columnSelector.updateSelected(selectedColumns, rowReaderOptions); + + std::vector ranges; + ranges.reserve(newIncludeTypes.size()); + for (auto stripe : newStripes) { + // get stripe information + const auto& stripeInfo = footer_->stripes(stripe); + uint64_t stripeFooterStart = + stripeInfo.offset() + stripeInfo.index_length() + stripeInfo.data_length(); + uint64_t stripeFooterLength = stripeInfo.footer_length(); + + // get stripe footer + std::unique_ptr pbStream = createDecompressor( + contents_->compression, + std::make_unique(contents_->stream.get(), stripeFooterStart, + stripeFooterLength, *contents_->pool), + contents_->blockSize, *contents_->pool, contents_->readerMetrics); + proto::StripeFooter stripeFooter; + if (!stripeFooter.ParseFromZeroCopyStream(pbStream.get())) { + throw ParseError(std::string("bad StripeFooter from ") + pbStream->getName()); + } + + // traverse all streams in stripe footer, choose selected streams to prebuffer + uint64_t offset = stripeInfo.offset(); + for (int i = 0; i < stripeFooter.streams_size(); i++) { + const proto::Stream& stream = stripeFooter.streams(i); + if (offset + stream.length() > stripeFooterStart) { + std::stringstream msg; + msg << "Malformed stream meta at stream index " << i << " in stripe " << stripe + << ": streamOffset=" << offset << ", streamLength=" << stream.length() + << ", stripeOffset=" << stripeInfo.offset() + << ", stripeIndexLength=" << stripeInfo.index_length() + << ", stripeDataLength=" << stripeInfo.data_length(); + throw ParseError(msg.str()); + } + + if (stream.has_kind() && selectedColumns[stream.column()]) { + const auto& kind = stream.kind(); + if (kind == proto::Stream_Kind_DATA || kind == proto::Stream_Kind_DICTIONARY_DATA || + kind == proto::Stream_Kind_PRESENT || kind == proto::Stream_Kind_LENGTH || + kind == proto::Stream_Kind_SECONDARY) { + ranges.emplace_back(offset, stream.length()); + } + } + + offset += stream.length(); + } + + { + std::lock_guard lock(contents_->readCacheMutex); + + if (!contents_->readCache) { + contents_->readCache = std::make_shared( + getStream(), options_.getCacheOptions(), contents_->pool, contents_->readerMetrics); + } + contents_->readCache->cache(std::move(ranges)); + } + } + } + RowReader::~RowReader() { // PASS } diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh index 630d812c38..3d81d26920 100644 --- 
a/c++/src/Reader.hh +++ b/c++/src/Reader.hh @@ -26,6 +26,8 @@ #include "ColumnReader.hh" #include "RLE.hh" +#include "io/Cache.hh" + #include "SchemaEvolution.hh" #include "TypeImpl.hh" #include "sargs/SargsApplier.hh" @@ -70,6 +72,11 @@ namespace orc { bool isDecimalAsLong; std::unique_ptr metadata; ReaderMetrics* readerMetrics; + + // mutex to protect readCache_ from concurrent access + std::mutex readCacheMutex; + // cached io ranges. only valid when preBuffer is invoked. + std::shared_ptr readCache; }; proto::StripeFooter getStripeFooter(const proto::StripeInformation& info, @@ -245,6 +252,10 @@ namespace orc { const SchemaEvolution* getSchemaEvolution() const { return &schemaEvolution_; } + + std::shared_ptr getReadCache() const { + return contents_->readCache; + } }; class ReaderImpl : public Reader { @@ -260,15 +271,16 @@ namespace orc { // footer proto::Footer* footer_; uint64_t numberOfStripes_; + uint64_t getMemoryUse(int stripeIx, std::vector& selectedColumns); // internal methods void readMetadata() const; void checkOrcVersion(); - void getRowIndexStatistics( - const proto::StripeInformation& stripeInfo, uint64_t stripeIndex, - const proto::StripeFooter& currentStripeFooter, - std::vector >* indexStats) const; + void getRowIndexStatistics(const proto::StripeInformation& stripeInfo, uint64_t stripeIndex, + const proto::StripeFooter& currentStripeFooter, + std::vector>* indexStats) const; + proto::StripeFooter loadCurrentStripeFooter(uint32_t stripeIndex, uint64_t& offset) const; // metadata mutable bool isMetadataLoaded_; @@ -318,7 +330,8 @@ namespace orc { const std::string& getStreamName() const override; - std::unique_ptr getStripeStatistics(uint64_t stripeIndex) const override; + std::unique_ptr getStripeStatistics( + uint64_t stripeIndex, bool includeRowIndex = true) const override; std::unique_ptr createRowReader() const override; @@ -374,6 +387,13 @@ namespace orc { std::map getBloomFilters( uint32_t stripeIndex, const std::set& included) const override; + + void preBuffer(const std::vector& stripes, + const std::list& includeTypes) override; + void releaseBuffer(uint64_t boundary) override; + + std::map getRowGroupIndex( + uint32_t stripeIndex, const std::set& included) const override; }; } // namespace orc diff --git a/c++/src/RleEncoderV2.cc b/c++/src/RleEncoderV2.cc index 18c5200254..1cda9ee91e 100644 --- a/c++/src/RleEncoderV2.cc +++ b/c++/src/RleEncoderV2.cc @@ -440,31 +440,8 @@ namespace orc { } uint64_t RleEncoderV2::flush() { - if (numLiterals != 0) { - EncodingOption option = {}; - if (variableRunLength_ != 0) { - determineEncoding(option); - writeValues(option); - } else if (fixedRunLength_ != 0) { - if (fixedRunLength_ < MIN_REPEAT) { - variableRunLength_ = fixedRunLength_; - fixedRunLength_ = 0; - determineEncoding(option); - writeValues(option); - } else if (fixedRunLength_ >= MIN_REPEAT && fixedRunLength_ <= MAX_SHORT_REPEAT_LENGTH) { - option.encoding = SHORT_REPEAT; - writeValues(option); - } else { - option.encoding = DELTA; - option.isFixedDelta = true; - writeValues(option); - } - } - } - - outputStream->BackUp(static_cast(bufferLength - bufferPosition)); + finishEncode(); uint64_t dataSize = outputStream->flush(); - bufferLength = bufferPosition = 0; return dataSize; } @@ -779,4 +756,30 @@ namespace orc { fixedRunLength_ = 1; variableRunLength_ = 1; } + + void RleEncoderV2::finishEncode() { + if (numLiterals != 0) { + EncodingOption option = {}; + if (variableRunLength_ != 0) { + determineEncoding(option); + writeValues(option); + } else if 
(fixedRunLength_ != 0) { + if (fixedRunLength_ < MIN_REPEAT) { + variableRunLength_ = fixedRunLength_; + fixedRunLength_ = 0; + determineEncoding(option); + writeValues(option); + } else if (fixedRunLength_ >= MIN_REPEAT && fixedRunLength_ <= MAX_SHORT_REPEAT_LENGTH) { + option.encoding = SHORT_REPEAT; + writeValues(option); + } else { + option.encoding = DELTA; + option.isFixedDelta = true; + writeValues(option); + } + } + } + + RleEncoder::finishEncode(); + } } // namespace orc diff --git a/c++/src/SchemaEvolution.cc b/c++/src/SchemaEvolution.cc index 4099818ff9..7cf3b5c512 100644 --- a/c++/src/SchemaEvolution.cc +++ b/c++/src/SchemaEvolution.cc @@ -80,7 +80,7 @@ namespace orc { if (readType.getKind() == fileType.getKind()) { ret.isValid = true; if (fileType.getKind() == CHAR || fileType.getKind() == VARCHAR) { - ret.isValid = readType.getMaximumLength() == fileType.getMaximumLength(); + ret.needConvert = readType.getMaximumLength() != fileType.getMaximumLength(); } else if (fileType.getKind() == DECIMAL) { ret.needConvert = readType.getPrecision() != fileType.getPrecision() || readType.getScale() != fileType.getScale(); @@ -105,7 +105,11 @@ namespace orc { } case STRING: case CHAR: - case VARCHAR: + case VARCHAR: { + ret.isValid = ret.needConvert = isStringVariant(readType) || isNumeric(readType) || + isTimestamp(readType) || isDecimal(readType); + break; + } case TIMESTAMP: case TIMESTAMP_INSTANT: case DATE: diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc index f9581215b3..c1a23cad16 100644 --- a/c++/src/Statistics.cc +++ b/c++/src/Statistics.cc @@ -81,11 +81,20 @@ namespace orc { // PASS } - StripeStatisticsImpl::StripeStatisticsImpl( + StripeStatisticsImpl::StripeStatisticsImpl(const proto::StripeStatistics& stripeStats, + const StatContext& statContext) { + columnStats_ = std::make_unique(stripeStats, statContext); + } + + StripeStatisticsWithRowGroupIndexImpl::~StripeStatisticsWithRowGroupIndexImpl() { + // PASS + } + + StripeStatisticsWithRowGroupIndexImpl::StripeStatisticsWithRowGroupIndexImpl( const proto::StripeStatistics& stripeStats, std::vector >& indexStats, - const StatContext& statContext) { - columnStats_ = std::make_unique(stripeStats, statContext); + const StatContext& statContext) + : StripeStatisticsImpl(stripeStats, statContext) { rowIndexStats_.resize(indexStats.size()); for (size_t i = 0; i < rowIndexStats_.size(); i++) { for (size_t j = 0; j < indexStats[i].size(); j++) { @@ -181,13 +190,13 @@ namespace orc { ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& pb) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); } BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (pb.has_binary_statistics() && statContext.correctStats) { stats_.setHasTotalLength(pb.binary_statistics().has_sum()); stats_.setTotalLength(static_cast(pb.binary_statistics().sum())); @@ -197,7 +206,7 @@ namespace orc { BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? 
pb.has_null() : true); if (pb.has_bucket_statistics() && statContext.correctStats) { hasCount_ = true; trueCount_ = pb.bucket_statistics().count(0); @@ -210,7 +219,7 @@ namespace orc { DateColumnStatisticsImpl::DateColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_date_statistics() || !statContext.correctStats) { // hasMinimum_ is false by default; // hasMaximum_ is false by default; @@ -227,7 +236,7 @@ namespace orc { DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (pb.has_decimal_statistics() && statContext.correctStats) { const proto::DecimalStatistics& stats = pb.decimal_statistics(); stats_.setHasMinimum(stats.has_minimum()); @@ -242,7 +251,7 @@ namespace orc { DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl(const proto::ColumnStatistics& pb) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_double_statistics()) { stats_.setMinimum(0); stats_.setMaximum(0); @@ -261,7 +270,7 @@ namespace orc { IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl(const proto::ColumnStatistics& pb) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_int_statistics()) { stats_.setMinimum(0); stats_.setMaximum(0); @@ -281,7 +290,7 @@ namespace orc { StringColumnStatisticsImpl::StringColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_string_statistics() || !statContext.correctStats) { stats_.setTotalLength(0); } else { @@ -299,7 +308,7 @@ namespace orc { TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_timestamp_statistics() || !statContext.correctStats) { stats_.setMinimum(0); stats_.setMaximum(0); @@ -365,7 +374,7 @@ namespace orc { CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl( const proto::ColumnStatistics& pb) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? 
pb.has_null() : true); if (!pb.has_collection_statistics()) { stats_.setMinimum(0); stats_.setMaximum(0); diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh index 6f212c15cc..b7ed5d1e56 100644 --- a/c++/src/Statistics.hh +++ b/c++/src/Statistics.hh @@ -1713,7 +1713,6 @@ namespace orc { class StripeStatisticsImpl : public StripeStatistics { private: std::unique_ptr columnStats_; - std::vector > > rowIndexStats_; // DELIBERATELY NOT IMPLEMENTED StripeStatisticsImpl(const StripeStatisticsImpl&); @@ -1721,7 +1720,6 @@ namespace orc { public: StripeStatisticsImpl(const proto::StripeStatistics& stripeStats, - std::vector >& indexStats, const StatContext& statContext); virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override { @@ -1732,13 +1730,38 @@ namespace orc { return columnStats_->getNumberOfColumns(); } + virtual const ColumnStatistics* getRowIndexStatistics(uint32_t, uint32_t) const override { + throw NotImplementedYet("set includeRowIndex true to get row index stats"); + } + + virtual ~StripeStatisticsImpl() override; + + virtual uint32_t getNumberOfRowIndexStats(uint32_t) const override { + throw NotImplementedYet("set includeRowIndex true to get row index stats"); + } + }; + + class StripeStatisticsWithRowGroupIndexImpl : public StripeStatisticsImpl { + private: + std::vector > > rowIndexStats_; + + // DELIBERATELY NOT IMPLEMENTED + StripeStatisticsWithRowGroupIndexImpl(const StripeStatisticsWithRowGroupIndexImpl&); + StripeStatisticsWithRowGroupIndexImpl& operator=(const StripeStatisticsWithRowGroupIndexImpl&); + + public: + StripeStatisticsWithRowGroupIndexImpl( + const proto::StripeStatistics& stripeStats, + std::vector >& indexStats, + const StatContext& statContext); + virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId, uint32_t rowIndex) const override { // check id indices are valid return rowIndexStats_[columnId][rowIndex].get(); } - virtual ~StripeStatisticsImpl() override; + virtual ~StripeStatisticsWithRowGroupIndexImpl() override; uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override { return static_cast(rowIndexStats_[columnId].size()); diff --git a/c++/src/StripeStream.cc b/c++/src/StripeStream.cc index f4345c0871..a5609f7629 100644 --- a/c++/src/StripeStream.cc +++ b/c++/src/StripeStream.cc @@ -19,6 +19,7 @@ #include "StripeStream.hh" #include "RLE.hh" #include "Reader.hh" +#include "io/Cache.hh" #include "orc/Exceptions.hh" #include "wrap/coded-stream-wrapper.h" @@ -37,7 +38,8 @@ namespace orc { stripeStart_(stripeStart), input_(input), writerTimezone_(writerTimezone), - readerTimezone_(readerTimezone) { + readerTimezone_(readerTimezone), + readCache_(reader.getReadCache()) { // PASS } @@ -89,7 +91,6 @@ namespace orc { if (stream.has_kind() && stream.kind() == kind && stream.column() == static_cast(columnId)) { uint64_t streamLength = stream.length(); - uint64_t myBlock = shouldStream ? input_.getNaturalReadSize() : streamLength; if (offset + streamLength > dataEnd) { std::stringstream msg; msg << "Malformed stream meta at stream index " << i << " in stripe " << stripeIndex_ @@ -99,9 +100,23 @@ namespace orc { << ", stripeDataLength=" << stripeInfo_.data_length(); throw ParseError(msg.str()); } - return createDecompressor(reader_.getCompression(), - std::make_unique( - &input_, offset, stream.length(), *pool, myBlock), + + BufferSlice slice; + if (readCache_) { + ReadRange range{offset, streamLength}; + slice = readCache_->read(range); + } + + uint64_t myBlock = shouldStream ? 
input_.getNaturalReadSize() : streamLength; + std::unique_ptr seekableInput; + if (slice.buffer) { + seekableInput = std::make_unique( + slice.buffer->data() + slice.offset, slice.length); + } else { + seekableInput = std::make_unique(&input_, offset, streamLength, + *pool, myBlock); + } + return createDecompressor(reader_.getCompression(), std::move(seekableInput), reader_.getCompressionSize(), *pool, reader_.getFileContents().readerMetrics); } diff --git a/c++/src/StripeStream.hh b/c++/src/StripeStream.hh index ad82d472c2..2d26f8575e 100644 --- a/c++/src/StripeStream.hh +++ b/c++/src/StripeStream.hh @@ -30,6 +30,7 @@ namespace orc { class RowReaderImpl; + class ReadRangeCache; /** * StripeStream Implementation @@ -45,6 +46,7 @@ namespace orc { InputStream& input_; const Timezone& writerTimezone_; const Timezone& readerTimezone_; + std::shared_ptr readCache_; public: StripeStreamsImpl(const RowReaderImpl& reader, uint64_t index, diff --git a/c++/src/Timezone.cc b/c++/src/Timezone.cc index 32276a850d..384f8ea99f 100644 --- a/c++/src/Timezone.cc +++ b/c++/src/Timezone.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -655,25 +656,24 @@ namespace orc { epoch_ = utcEpoch - getVariant(utcEpoch).gmtOffset; } - const char* getTimezoneDirectory() { + std::string getTimezoneDirectory() { const char* dir = getenv("TZDIR"); if (!dir) { - dir = DEFAULT_TZDIR; + // this is present if we're in an activated conda environment + const char* condaPrefix = getenv("CONDA_PREFIX"); + if (condaPrefix) { + std::string condaDir(condaPrefix); + condaDir += "/share/zoneinfo"; + return condaDir; + } else { + dir = DEFAULT_TZDIR; + } } return dir; } - /** - * Get a timezone by absolute filename. - * Results are cached. - */ - const Timezone& getTimezoneByFilename(const std::string& filename) { - // ORC-110 - std::lock_guard timezone_lock(timezone_mutex); - std::map >::iterator itr = timezoneCache.find(filename); - if (itr != timezoneCache.end()) { - return *(itr->second).get(); - } + static std::vector loadTZDB(const std::string& filename) { + std::vector buffer; if (!fileExists(filename.c_str())) { std::stringstream ss; ss << "Time zone file " << filename << " does not exist." 
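The getTimezoneDirectory() change above resolves the zoneinfo directory in a fixed order: an explicit TZDIR always wins, an activated conda environment's bundled database comes next, and the compiled-in default is the last resort. A minimal standalone sketch of that order (kDefaultTzDir below is only an illustrative stand-in for the build-time DEFAULT_TZDIR macro):

#include <cstdlib>
#include <string>

// Illustrative stand-in for the DEFAULT_TZDIR macro defined at build time.
static const char* kDefaultTzDir = "/usr/share/zoneinfo";

std::string resolveTimezoneDirectory() {
  if (const char* dir = std::getenv("TZDIR")) {
    return dir;  // explicit override always wins
  }
  if (const char* condaPrefix = std::getenv("CONDA_PREFIX")) {
    // an activated conda environment ships its own zoneinfo copy
    return std::string(condaPrefix) + "/share/zoneinfo";
  }
  return kDefaultTzDir;  // fall back to the system database
}

Combined with the LazyTimezone class introduced below, no timezone file is opened or parsed until a reader actually asks for a conversion.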
@@ -683,12 +683,65 @@ try { std::unique_ptr<InputStream> file = readFile(filename); size_t size = static_cast<size_t>(file->getLength()); - std::vector<char> buffer(size); + buffer.resize(size); file->read(&buffer[0], size, 0); - timezoneCache[filename] = std::make_shared<TimezoneImpl>(filename, buffer); } catch (ParseError& err) { throw TimezoneError(err.what()); } + return buffer; + } + + class LazyTimezone : public Timezone { + private: + std::string filename_; + mutable std::unique_ptr<TimezoneImpl> impl_; + mutable std::once_flag initialized_; + + TimezoneImpl* getImpl() const { + std::call_once(initialized_, [&]() { + auto buffer = loadTZDB(filename_); + impl_ = std::make_unique<TimezoneImpl>(filename_, std::move(buffer)); + }); + return impl_.get(); + } + + public: + LazyTimezone(const std::string& filename) : filename_(filename) {} + + const TimezoneVariant& getVariant(int64_t clk) const override { + return getImpl()->getVariant(clk); + } + int64_t getEpoch() const override { + return getImpl()->getEpoch(); + } + void print(std::ostream& os) const override { + return getImpl()->print(os); + } + uint64_t getVersion() const override { + return getImpl()->getVersion(); + } + + int64_t convertToUTC(int64_t clk) const override { + return getImpl()->convertToUTC(clk); + } + + int64_t convertFromUTC(int64_t clk) const override { + return getImpl()->convertFromUTC(clk); + } + }; + + /** + * Get a timezone by absolute filename. + * Results are cached. + */ + const Timezone& getTimezoneByFilename(const std::string& filename) { + // ORC-110 + std::lock_guard<std::mutex> timezone_lock(timezone_mutex); + std::map<std::string, std::shared_ptr<Timezone> >::iterator itr = timezoneCache.find(filename); + if (itr != timezoneCache.end()) { + return *(itr->second).get(); + } + timezoneCache[filename] = std::make_shared<LazyTimezone>(filename); return *timezoneCache[filename].get(); } diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc index c7b073c713..cbc7b82796 100644 --- a/c++/src/TypeImpl.cc +++ b/c++/src/TypeImpl.cc @@ -660,7 +660,8 @@ namespace orc { std::pair<std::string, size_t> nameRes = parseName(input, pos, end); pos = nameRes.second; if (input[pos] != ':') { - throw std::logic_error("Invalid struct type. No field name set."); + throw std::logic_error("Invalid struct type. Field name cannot contain '" + + std::string(1, input[pos]) + "'."); } std::pair<std::unique_ptr<Type>, size_t> typeRes = TypeImpl::parseType(input, ++pos, end); result->addStructField(nameRes.first, std::move(typeRes.first)); diff --git a/c++/src/Utils.hh b/c++/src/Utils.hh index 4a609788f9..851d0af15c 100644 --- a/c++/src/Utils.hh +++ b/c++/src/Utils.hh @@ -21,6 +21,7 @@ #include #include +#include namespace orc { @@ -70,6 +71,75 @@ namespace orc { #define SCOPED_MINUS_STOPWATCH(METRICS_PTR, LATENCY_VAR) #endif + struct Utf8Utils { + /** + * Counts how many UTF-8 characters are in the input data + */ + static uint64_t charLength(const char* data, uint64_t length) { + uint64_t chars = 0; + for (uint64_t i = 0; i < length; i++) { + if (isUtfStartByte(data[i])) { + chars++; + } + } + return chars; + } + + /** + * Return the number of bytes required to read at most maxCharLength + * characters in full from a utf-8 encoded byte array provided + * by data. This does not validate utf-8 data, but + * operates correctly on already valid utf-8 data.
+ * + * @param maxCharLength number of characters required + * @param data the bytes of UTF-8 + * @param length the length of data to truncate + */ + static uint64_t truncateBytesTo(uint64_t maxCharLength, const char* data, uint64_t length) { + uint64_t chars = 0; + if (length <= maxCharLength) { + return length; + } + for (uint64_t i = 0; i < length; i++) { + if (isUtfStartByte(data[i])) { + chars++; + } + if (chars > maxCharLength) { + return i; + } + } + // everything fits + return length; + } + + /** + * Checks if b is the first byte of a UTF-8 character. + */ + inline static bool isUtfStartByte(char b) { + return (b & 0xC0) != 0x80; + } + + /** + * Find the start of the last character that ends in the current string. + * @param text the bytes of the utf-8 + * @param from the first byte location + * @param until the last byte location + * @return the index of the last character + */ + static uint64_t findLastCharacter(const char* text, uint64_t from, uint64_t until) { + uint64_t posn = until; + /* we don't expect characters more than 5 bytes */ + while (posn >= from) { + if (isUtfStartByte(text[posn])) { + return posn; + } + posn -= 1; + } + /* beginning of a valid char not found */ + throw std::logic_error("Could not truncate string, beginning of a valid char not found"); + } + }; + } // namespace orc #endif diff --git a/c++/src/Vector.cc b/c++/src/Vector.cc index bc44469959..49f47aeb03 100644 --- a/c++/src/Vector.cc +++ b/c++/src/Vector.cc @@ -34,6 +34,7 @@ namespace orc { notNull(pool, cap), hasNulls(false), isEncoded(false), + dictionaryDecoded(false), memoryPool(pool) { std::memset(notNull.data(), 1, capacity); } @@ -61,6 +62,13 @@ namespace orc { return false; } + void ColumnVectorBatch::decodeDictionary() { + if (dictionaryDecoded) return; + + decodeDictionaryImpl(); + dictionaryDecoded = true; + } + StringDictionary::StringDictionary(MemoryPool& pool) : dictionaryBlob(pool), dictionaryOffset(pool) { // PASS @@ -88,6 +96,17 @@ namespace orc { } } + void EncodedStringVectorBatch::decodeDictionaryImpl() { + size_t n = index.size(); + resize(n); + + for (size_t i = 0; i < n; ++i) { + if (!hasNulls || notNull[i]) { + dictionary->getValueByIndex(index[i], data[i], length[i]); + } + } + } + StringVectorBatch::StringVectorBatch(uint64_t capacity, MemoryPool& pool) : ColumnVectorBatch(capacity, pool), data(pool, capacity), @@ -174,6 +193,12 @@ namespace orc { return false; } + void StructVectorBatch::decodeDictionaryImpl() { + for (const auto& field : fields) { + field->decodeDictionary(); + } + } + ListVectorBatch::ListVectorBatch(uint64_t cap, MemoryPool& pool) : ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) { offsets.zeroOut(); @@ -211,6 +236,10 @@ namespace orc { return true; } + void ListVectorBatch::decodeDictionaryImpl() { + elements->decodeDictionary(); + } + MapVectorBatch::MapVectorBatch(uint64_t cap, MemoryPool& pool) : ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) { offsets.zeroOut(); @@ -251,6 +280,16 @@ namespace orc { return true; } + void MapVectorBatch::decodeDictionaryImpl() { + if (keys) { + keys->decodeDictionary(); + } + + if (elements) { + elements->decodeDictionary(); + } + } + UnionVectorBatch::UnionVectorBatch(uint64_t cap, MemoryPool& pool) : ColumnVectorBatch(cap, pool), tags(pool, cap), offsets(pool, cap) { tags.zeroOut(); @@ -310,6 +349,12 @@ namespace orc { return false; } + void UnionVectorBatch::decodeDictionaryImpl() { + for (const auto& child : children) { + child->decodeDictionary(); + } + } + 
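The decodeDictionary() machinery added to Vector.cc above is a memoized template method: the public entry point runs the type-specific decode at most once per batch, EncodedStringVectorBatch materializes its values from the dictionary, and container batches recurse into their children. A condensed sketch of the pattern, with simplified stand-in types (not the real ColumnVectorBatch hierarchy):

#include <memory>
#include <vector>

struct BatchSketch {
  bool dictionaryDecoded = false;

  // Public entry point: idempotent, so callers may invoke it freely.
  void decodeDictionary() {
    if (dictionaryDecoded) return;
    decodeDictionaryImpl();  // type-specific work, may recurse
    dictionaryDecoded = true;
  }

  virtual void decodeDictionaryImpl() {}  // leaf types override this
  virtual ~BatchSketch() = default;
};

struct StructSketch : BatchSketch {
  std::vector<std::unique_ptr<BatchSketch>> fields;

  // Container types only propagate the decode to their children.
  void decodeDictionaryImpl() override {
    for (const auto& field : fields) {
      field->decodeDictionary();
    }
  }
};

Because the flag lives in the base class, repeated calls on the same batch tree are cheap no-ops.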
Decimal64VectorBatch::Decimal64VectorBatch(uint64_t cap, MemoryPool& pool) : ColumnVectorBatch(cap, pool), precision(0), diff --git a/c++/src/Writer.cc b/c++/src/Writer.cc index fceac7c2fb..775e6d2452 100644 --- a/c++/src/Writer.cc +++ b/c++/src/Writer.cc @@ -46,6 +46,8 @@ namespace orc { WriterMetrics* metrics; bool useTightNumericVector; uint64_t outputBufferCapacity; + uint64_t memoryBlockSize; + bool alignBlockBoundToRowGroup; WriterOptionsPrivate() : fileVersion(FileVersion::v_0_12()) { // default to Hive_0_12 stripeSize = 64 * 1024 * 1024; // 64M @@ -67,6 +69,8 @@ namespace orc { metrics = nullptr; useTightNumericVector = false; outputBufferCapacity = 1024 * 1024; + memoryBlockSize = 64 * 1024; // 64K + alignBlockBoundToRowGroup = false; } }; @@ -287,6 +291,24 @@ namespace orc { return privateBits_->outputBufferCapacity; } + WriterOptions& WriterOptions::setMemoryBlockSize(uint64_t capacity) { + privateBits_->memoryBlockSize = capacity; + return *this; + } + + uint64_t WriterOptions::getMemoryBlockSize() const { + return privateBits_->memoryBlockSize; + } + + WriterOptions& WriterOptions::setAlignBlockBoundToRowGroup(bool alignBlockBoundToRowGroup) { + privateBits_->alignBlockBoundToRowGroup = alignBlockBoundToRowGroup; + return *this; + } + + bool WriterOptions::getAlignBlockBoundToRowGroup() const { + return privateBits_->alignBlockBoundToRowGroup; + } + Writer::~Writer() { // PASS } @@ -352,11 +374,16 @@ namespace orc { useTightNumericVector_ = opts.getUseTightNumericVector(); + if (options_.getCompressionBlockSize() % options_.getMemoryBlockSize() != 0) { + throw std::invalid_argument( + "Compression block size must be a multiple of memory block size."); + } + // compression stream for stripe footer, file footer and metadata - compressionStream_ = - createCompressor(options_.getCompression(), outStream_, options_.getCompressionStrategy(), - options_.getOutputBufferCapacity(), options_.getCompressionBlockSize(), - *options_.getMemoryPool(), options_.getWriterMetrics()); + compressionStream_ = createCompressor( + options_.getCompression(), outStream_, options_.getCompressionStrategy(), + options_.getOutputBufferCapacity(), options_.getCompressionBlockSize(), + options_.getMemoryBlockSize(), *options_.getMemoryPool(), options_.getWriterMetrics()); // uncompressed stream for post script bufferedStream_.reset(new BufferedOutputStream(*options_.getMemoryPool(), outStream_, @@ -385,6 +412,9 @@ namespace orc { stripeRows_ += chunkSize; if (indexRows_ >= rowIndexStride) { + if (options_.getAlignBlockBoundToRowGroup()) { + columnWriter_->finishStreams(); + } columnWriter_->createRowIndexEntry(); indexRows_ = 0; } @@ -437,7 +467,7 @@ namespace orc { // Write file header const static size_t magicIdLength = strlen(WriterImpl::magicId); { - SCOPED_STOPWATCH(options.getWriterMetrics(), IOBlockingLatencyUs, IOCount); + SCOPED_STOPWATCH(options_.getWriterMetrics(), IOBlockingLatencyUs, IOCount); outStream_->write(WriterImpl::magicId, magicIdLength); } currentOffset_ += magicIdLength; @@ -585,7 +615,7 @@ namespace orc { throw std::logic_error("Failed to write post script."); } unsigned char psLength = static_cast(bufferedStream_->flush()); - SCOPED_STOPWATCH(options.getWriterMetrics(), IOBlockingLatencyUs, IOCount); + SCOPED_STOPWATCH(options_.getWriterMetrics(), IOBlockingLatencyUs, IOCount); outStream_->write(&psLength, sizeof(unsigned char)); } diff --git a/c++/src/io/Cache.cc b/c++/src/io/Cache.cc new file mode 100644 index 0000000000..39f63fdd2b --- /dev/null +++ b/c++/src/io/Cache.cc @@ -0,0 
+1,171 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "Cache.hh" + +namespace orc { + + std::vector ReadRangeCombiner::coalesce(std::vector ranges) const { + if (ranges.empty()) { + return ranges; + } + + // Remove zero-sized ranges + auto end = std::remove_if(ranges.begin(), ranges.end(), + [](const ReadRange& range) { return range.length == 0; }); + // Sort in position order + std::sort(ranges.begin(), end, [](const ReadRange& a, const ReadRange& b) { + return a.offset != b.offset ? a.offset < b.offset : a.length > b.length; + }); + + // Remove ranges that overlap 100% + std::vector uniqueRanges; + uniqueRanges.reserve(ranges.size()); + for (auto it = ranges.begin(); it != end; ++it) { + if (uniqueRanges.empty() || !uniqueRanges.back().contains(*it)) { + uniqueRanges.push_back(*it); + } + } + ranges = std::move(uniqueRanges); + + // Skip further processing if ranges is empty after removing zero-sized ranges. + if (ranges.empty()) { + return ranges; + } + +#ifndef NDEBUG + for (size_t i = 0; i < ranges.size() - 1; ++i) { + const auto& left = ranges[i]; + const auto& right = ranges[i + 1]; + assert(left.offset < right.offset); + assert(!left.contains(right)); + } +#endif + + std::vector coalesced; + auto itr = ranges.begin(); + + // Start of the current coalesced range and end (exclusive) of previous range. + // Both are initialized with the start of first range which is a placeholder value. + uint64_t coalescedStart = itr->offset; + uint64_t coalescedEnd = coalescedStart + itr->length; + + for (++itr; itr < ranges.end(); ++itr) { + const uint64_t currentRangeStart = itr->offset; + const uint64_t currentRangeEnd = currentRangeStart + itr->length; + + assert(coalescedStart < coalescedEnd); + assert(currentRangeStart < currentRangeEnd); + + // At this point, the coalesced range is [coalesced_start, prev_range_end). + // Stop coalescing if: + // - coalesced range is too large, or + // - distance (hole/gap) between consecutive ranges is too large. + if ((currentRangeEnd - coalescedStart > rangeSizeLimit) || + (currentRangeStart > coalescedEnd + holeSizeLimit)) { + coalesced.push_back({coalescedStart, coalescedEnd - coalescedStart}); + coalescedStart = currentRangeStart; + } + + // Update the prev_range_end with the current range. 
+ coalescedEnd = currentRangeEnd; + } + coalesced.push_back({coalescedStart, coalescedEnd - coalescedStart}); + + assert(coalesced.front().offset == ranges.front().offset); + assert(coalesced.back().offset + coalesced.back().length == + ranges.back().offset + ranges.back().length); + return coalesced; + } + + std::vector ReadRangeCombiner::coalesceReadRanges(std::vector ranges, + uint64_t holeSizeLimit, + uint64_t rangeSizeLimit) { + assert(rangeSizeLimit > holeSizeLimit); + + ReadRangeCombiner combiner{holeSizeLimit, rangeSizeLimit}; + return combiner.coalesce(std::move(ranges)); + } + + void ReadRangeCache::cache(std::vector ranges) { + ranges = ReadRangeCombiner::coalesceReadRanges(std::move(ranges), options_.holeSizeLimit, + options_.rangeSizeLimit); + + std::vector newEntries = makeCacheEntries(ranges); + // Add new entries, themselves ordered by offset + if (entries_.size() > 0) { + std::vector merged(entries_.size() + newEntries.size()); + std::merge(entries_.begin(), entries_.end(), newEntries.begin(), newEntries.end(), + merged.begin()); + entries_ = std::move(merged); + } else { + entries_ = std::move(newEntries); + } + } + + BufferSlice ReadRangeCache::read(const ReadRange& range) { + if (range.length == 0) { + return {std::make_shared(*memoryPool_, 0), 0, 0}; + } + + const auto it = std::lower_bound(entries_.begin(), entries_.end(), range, + [](const RangeCacheEntry& entry, const ReadRange& range) { + return entry.range.offset + entry.range.length < + range.offset + range.length; + }); + + BufferSlice result{}; + bool hit_cache = false; + if (it != entries_.end() && it->range.contains(range)) { + hit_cache = it->future.valid(); + it->future.get(); + result = BufferSlice{it->buffer, range.offset - it->range.offset, range.length}; + } + + if (metrics_) { + if (hit_cache) + metrics_->ReadRangeCacheHits.fetch_add(1); + else + metrics_->ReadRangeCacheMisses.fetch_add(1); + } + return result; + } + + void ReadRangeCache::evictEntriesBefore(uint64_t boundary) { + auto it = std::lower_bound(entries_.begin(), entries_.end(), boundary, + [](const RangeCacheEntry& entry, uint64_t offset) { + return entry.range.offset + entry.range.length <= offset; + }); + entries_.erase(entries_.begin(), it); + } + + std::vector ReadRangeCache::makeCacheEntries( + const std::vector& ranges) const { + std::vector newEntries; + newEntries.reserve(ranges.size()); + for (const auto& range : ranges) { + BufferPtr buffer = std::make_shared(*memoryPool_, range.length); + std::future future = stream_->readAsync(buffer->data(), buffer->size(), range.offset); + newEntries.emplace_back(range, std::move(buffer), std::move(future)); + } + return newEntries; + } + +} // namespace orc diff --git a/c++/src/io/Cache.hh b/c++/src/io/Cache.hh new file mode 100644 index 0000000000..7fc79718aa --- /dev/null +++ b/c++/src/io/Cache.hh @@ -0,0 +1,122 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "orc/MemoryPool.hh" +#include "orc/OrcFile.hh" + +#include +#include +#include +#include +#include +#include + +namespace orc { + + struct ReadRange { + uint64_t offset; + uint64_t length; + + ReadRange() = default; + ReadRange(uint64_t offset, uint64_t length) : offset(offset), length(length) {} + + friend bool operator==(const ReadRange& left, const ReadRange& right) { + return (left.offset == right.offset && left.length == right.length); + } + friend bool operator!=(const ReadRange& left, const ReadRange& right) { + return !(left == right); + } + + bool contains(const ReadRange& other) const { + return (offset <= other.offset && offset + length >= other.offset + other.length); + } + }; + + struct ReadRangeCombiner { + const uint64_t holeSizeLimit; + const uint64_t rangeSizeLimit; + + std::vector coalesce(std::vector ranges) const; + + static std::vector coalesceReadRanges(std::vector ranges, + uint64_t holeSizeLimit, + uint64_t rangeSizeLimit); + }; + + using Buffer = DataBuffer; + using BufferPtr = std::shared_ptr; + + struct RangeCacheEntry { + ReadRange range; + BufferPtr buffer; + std::shared_future future; // use shared_future in case of multiple get calls + + RangeCacheEntry() = default; + RangeCacheEntry(const ReadRange& range, BufferPtr buffer, std::future future) + : range(range), buffer(std::move(buffer)), future(std::move(future).share()) {} + + friend bool operator<(const RangeCacheEntry& left, const RangeCacheEntry& right) { + return left.range.offset < right.range.offset; + } + }; + + struct BufferSlice { + BufferPtr buffer = nullptr; + uint64_t offset = 0; + uint64_t length = 0; + }; + + /// A read cache designed to hide IO latencies when reading. + class ReadRangeCache { + public: + /// Construct a read cache with given options + explicit ReadRangeCache(InputStream* stream, CacheOptions options, MemoryPool* memoryPool, + ReaderMetrics* metrics = nullptr) + : stream_(stream), + options_(std::move(options)), + memoryPool_(memoryPool), + metrics_(metrics) {} + + ~ReadRangeCache() = default; + + /// Cache the given ranges in the background. + /// + /// The caller must ensure that the ranges do not overlap with each other, + /// nor with previously cached ranges. Otherwise, behaviour will be undefined. + void cache(std::vector ranges); + + /// Read a range previously given to Cache(). + BufferSlice read(const ReadRange& range); + + /// Evict cache entries with its range before given boundary. 
+ void evictEntriesBefore(uint64_t boundary); + + private: + std::vector makeCacheEntries(const std::vector& ranges) const; + + InputStream* stream_; + CacheOptions options_; + // Ordered by offset (so as to find a matching region by binary search) + std::vector entries_; + MemoryPool* memoryPool_; + ReaderMetrics* metrics_; + }; + +} // namespace orc diff --git a/c++/src/io/OutputStream.cc b/c++/src/io/OutputStream.cc index 6fc68e262f..fbf1ca61dd 100644 --- a/c++/src/io/OutputStream.cc +++ b/c++/src/io/OutputStream.cc @@ -61,6 +61,10 @@ namespace orc { } } + void BufferedOutputStream::finishStream() { + // PASS + } + google::protobuf::int64 BufferedOutputStream::ByteCount() const { return static_cast(dataBuffer_->size()); } @@ -87,7 +91,7 @@ namespace orc { uint64_t dataSize = dataBuffer_->size(); // flush data buffer into outputStream if (dataSize > 0) { - SCOPED_STOPWATCH(metrics, IOBlockingLatencyUs, IOCount); + SCOPED_STOPWATCH(metrics_, IOBlockingLatencyUs, IOCount); dataBuffer_->writeTo(outputStream_, metrics_); } dataBuffer_->resize(0); @@ -98,6 +102,10 @@ namespace orc { dataBuffer_->resize(0); } + uint64_t BufferedOutputStream::getRawInputBufferSize() const { + throw std::logic_error("getRawInputBufferSize is not supported."); + } + void AppendOnlyBufferedStream::write(const char* data, size_t size) { size_t dataOffset = 0; while (size > 0) { @@ -120,25 +128,31 @@ namespace orc { } uint64_t AppendOnlyBufferedStream::flush() { - outStream_->BackUp(bufferLength_ - bufferOffset_); - bufferOffset_ = bufferLength_ = 0; - buffer_ = nullptr; + finishStream(); return outStream_->flush(); } void AppendOnlyBufferedStream::recordPosition(PositionRecorder* recorder) const { uint64_t flushedSize = outStream_->getSize(); - uint64_t unflushedSize = static_cast(bufferOffset_); + uint64_t unusedBufferSize = static_cast(bufferLength_ - bufferOffset_); if (outStream_->isCompressed()) { // start of the compression chunk in the stream recorder->add(flushedSize); - // number of decompressed bytes that need to be consumed - recorder->add(unflushedSize); + // There are multiple blocks in the input buffer, but bufferPosition only records the + // effective length of the last block. We need rawInputBufferSize to record the total length + // of all variable blocks. 
+ recorder->add(outStream_->getRawInputBufferSize() - unusedBufferSize); } else { - flushedSize -= static_cast<uint64_t>(bufferLength_); // byte offset of the start location - recorder->add(flushedSize + unflushedSize); + recorder->add(flushedSize - unusedBufferSize); } } + void AppendOnlyBufferedStream::finishStream() { + outStream_->BackUp(bufferLength_ - bufferOffset_); + outStream_->finishStream(); + bufferOffset_ = bufferLength_ = 0; + buffer_ = nullptr; + } + } // namespace orc diff --git a/c++/src/io/OutputStream.hh b/c++/src/io/OutputStream.hh index c63bc805bb..6319de96d6 100644 --- a/c++/src/io/OutputStream.hh +++ b/c++/src/io/OutputStream.hh @@ -69,10 +69,12 @@ namespace orc { virtual uint64_t getSize() const; virtual uint64_t flush(); virtual void suppress(); + virtual uint64_t getRawInputBufferSize() const; virtual bool isCompressed() const { return false; } + virtual void finishStream(); }; DIAGNOSTIC_POP @@ -98,6 +100,7 @@ namespace orc { void write(const char* data, size_t size); uint64_t getSize() const; uint64_t flush(); + void finishStream(); void recordPosition(PositionRecorder* recorder) const; }; diff --git a/c++/src/sargs/ExpressionTree.cc b/c++/src/sargs/ExpressionTree.cc index e49bca4b77..58dd13817d 100644 --- a/c++/src/sargs/ExpressionTree.cc +++ b/c++/src/sargs/ExpressionTree.cc @@ -110,6 +110,9 @@ namespace orc { return result; } case Operator::NOT: + if (children_.size() != 1) { + throw std::invalid_argument("NOT operator must have exactly one child"); + } return !children_.at(0)->evaluate(leaves); case Operator::LEAF: return leaves[leaf_]; @@ -159,6 +162,9 @@ namespace orc { sstream << ')'; break; case Operator::NOT: + if (children_.size() != 1) { + throw std::invalid_argument("NOT operator must have exactly one child"); + } sstream << "(not " << children_.at(0)->toString() << ')'; break; case Operator::LEAF: diff --git a/c++/src/sargs/PredicateLeaf.cc b/c++/src/sargs/PredicateLeaf.cc index d9df1c5d5c..5c77616836 100644 --- a/c++/src/sargs/PredicateLeaf.cc +++ b/c++/src/sargs/PredicateLeaf.cc @@ -701,6 +701,9 @@ namespace orc { } } + // Files written by Trino may lack the has_null field. + if (!colStats.has_has_null()) return TruthValue::YES_NO_NULL; + bool allNull = colStats.has_null() && colStats.number_of_values() == 0; if (operator_ == Operator::IS_NULL || ((operator_ == Operator::EQUALS || operator_ == Operator::NULL_SAFE_EQUALS) && diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt index b04055366c..f7328abb32 100644 --- a/c++/test/CMakeLists.txt +++ b/c++/test/CMakeLists.txt @@ -15,14 +15,15 @@ # specific language governing permissions and limitations # under the License.
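The has_has_null() checks added throughout Statistics.cc and the early return in PredicateLeaf.cc above enforce one conservative rule: when a writer (Trino, for example) never recorded the optional has_null field, assume nulls may be present and report YES_NO_NULL so that statistics-based pruning never skips a row group incorrectly. A small sketch of the shared idea; the helper names are hypothetical and the include path is assumed:

#include "wrap/orc-proto-wrapper.hh"  // assumed path for the generated proto types

namespace orc {
  // Statistics side: a missing has_null field reads as "may contain nulls".
  bool mayHaveNulls(const proto::ColumnStatistics& pb) {
    return pb.has_has_null() ? pb.has_null() : true;
  }

  // Predicate side: pruning is only allowed when the field was recorded;
  // otherwise evaluation short-circuits to TruthValue::YES_NO_NULL and
  // the row group is kept.
  bool canPruneWithNullStats(const proto::ColumnStatistics& colStats) {
    return colStats.has_has_null();
  }
}  // namespace orc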
-include_directories( - ${PROJECT_SOURCE_DIR}/c++/src +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX17_FLAGS} ${WARN_FLAGS}") + +add_library (orc-test-include INTERFACE) +target_include_directories (orc-test-include INTERFACE ${PROJECT_BINARY_DIR}/c++/include ${PROJECT_BINARY_DIR}/c++/src + ${PROJECT_SOURCE_DIR}/c++/src ) -set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX17_FLAGS} ${WARN_FLAGS}") - if(BUILD_ENABLE_AVX512) set(SIMD_TEST_SRCS TestRleVectorDecoder.cc) endif(BUILD_ENABLE_AVX512) @@ -62,6 +63,7 @@ add_executable (orc-test TestTimezone.cc TestType.cc TestWriter.cc + TestCache.cc ${SIMD_TEST_SRCS} ) @@ -73,6 +75,7 @@ target_link_libraries (orc-test orc::zlib orc::gtest orc::gmock + orc-test-include ) add_executable (create-test-files @@ -82,6 +85,7 @@ add_executable (create-test-files target_link_libraries (create-test-files orc orc::protobuf + orc-test-include ) if (TEST_VALGRIND_MEMCHECK) diff --git a/c++/test/MemoryInputStream.hh b/c++/test/MemoryInputStream.hh index e6ef55b6de..31333ae430 100644 --- a/c++/test/MemoryInputStream.hh +++ b/c++/test/MemoryInputStream.hh @@ -22,8 +22,6 @@ #include "io/InputStream.hh" #include "orc/OrcFile.hh" -#include - namespace orc { class MemoryInputStream : public InputStream { public: @@ -44,6 +42,11 @@ namespace orc { memcpy(buf, buffer_ + offset, length); } + std::future readAsync(void* buf, uint64_t length, uint64_t offset) override { + return std::async(std::launch::async, + [this, buf, length, offset] { this->read(buf, length, offset); }); + } + virtual const std::string& getName() const override { return name_; } diff --git a/c++/test/TestByteRle.cc b/c++/test/TestByteRle.cc index a822a61d6b..7717eab387 100644 --- a/c++/test/TestByteRle.cc +++ b/c++/test/TestByteRle.cc @@ -1263,7 +1263,7 @@ namespace orc { MemoryOutputStream memStream(capacity); std::unique_ptr encoder = createBooleanRleEncoder( createCompressor(CompressionKind_ZSTD, &memStream, CompressionStrategy_COMPRESSION, - capacity, blockSize, *getDefaultPool(), nullptr)); + capacity, blockSize, blockSize, *getDefaultPool(), nullptr)); encoder->add(data, numValues, nullptr); encoder->flush(); diff --git a/c++/test/TestCache.cc b/c++/test/TestCache.cc new file mode 100644 index 0000000000..496ba3ec90 --- /dev/null +++ b/c++/test/TestCache.cc @@ -0,0 +1,142 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "MemoryInputStream.hh" +#include "io/Cache.hh" + +#include "wrap/gmock.h" +#include "wrap/gtest-wrapper.h" + +namespace orc { + + TEST(TestReadRangeCombiner, testBasics) { + ReadRangeCombiner combinator{0, 100}; + /// Ranges with partial overlap and identical offsets + std::vector ranges{{0, 15}, {5, 11}, {5, 15}}; + std::vector result = combinator.coalesce(std::move(ranges)); + std::vector expect{{0, 20}}; + ASSERT_EQ(result, expect); + } + + TEST(TestCoalesceReadRanges, testBasics) { + auto check = [](std::vector ranges, std::vector expected) -> void { + const uint64_t holeSizeLimit = 9; + const uint64_t rangeSizeLimit = 99; + auto coalesced = ReadRangeCombiner::coalesceReadRanges(ranges, holeSizeLimit, rangeSizeLimit); + ASSERT_EQ(coalesced, expected); + }; + + check({}, {}); + // Zero sized range that ends up in empty list + check({{110, 0}}, {}); + // Combination on 1 zero sized range and 1 non-zero sized range + check({{110, 10}, {120, 0}}, {{110, 10}}); + // 1 non-zero sized range + check({{110, 10}}, {{110, 10}}); + // No holes + unordered ranges + check({{130, 10}, {110, 10}, {120, 10}}, {{110, 30}}); + // No holes + check({{110, 10}, {120, 10}, {130, 10}}, {{110, 30}}); + // Small holes only + check({{110, 11}, {130, 11}, {150, 11}}, {{110, 51}}); + // Large holes + check({{110, 10}, {130, 10}}, {{110, 10}, {130, 10}}); + check({{110, 11}, {130, 11}, {150, 10}, {170, 11}, {190, 11}}, {{110, 50}, {170, 31}}); + + // With zero-sized ranges + check({{110, 11}, {130, 0}, {130, 11}, {145, 0}, {150, 11}, {200, 0}}, {{110, 51}}); + + // No holes but large ranges + check({{110, 100}, {210, 100}}, {{110, 100}, {210, 100}}); + // Small holes and large range in the middle (*) + check({{110, 10}, {120, 11}, {140, 100}, {240, 11}, {260, 11}}, + {{110, 21}, {140, 100}, {240, 31}}); + // Mid-size ranges that would turn large after coalescing + check({{100, 50}, {150, 50}}, {{100, 50}, {150, 50}}); + check({{100, 30}, {130, 30}, {160, 30}, {190, 30}, {220, 30}}, {{100, 90}, {190, 60}}); + + // Same as (*) but unsorted + check({{140, 100}, {120, 11}, {240, 11}, {110, 10}, {260, 11}}, + {{110, 21}, {140, 100}, {240, 31}}); + + // Completely overlapping ranges should be eliminated + check({{20, 5}, {20, 5}, {21, 2}}, {{20, 5}}); + } + + TEST(TestReadRangeCache, testBasics) { + std::string data = "abcdefghijklmnopqrstuvwxyz"; + + CacheOptions options; + options.holeSizeLimit = 2; + options.rangeSizeLimit = 10; + + auto file = std::make_shared(data.data(), data.size()); + ReadRangeCache cache(file.get(), options, getDefaultPool()); + + cache.cache({{1, 2}, {3, 2}, {8, 2}, {20, 2}, {25, 0}}); + cache.cache({{10, 4}, {14, 0}, {15, 4}}); + + auto assert_slice_equal = [](const BufferSlice& slice, const std::string& expected) { + ASSERT_TRUE(slice.buffer); + ASSERT_EQ(expected, std::string_view(slice.buffer->data() + slice.offset, slice.length)); + }; + + BufferSlice slice; + + slice = cache.read({20, 2}); + assert_slice_equal(slice, "uv"); + + slice = cache.read({1, 2}); + assert_slice_equal(slice, "bc"); + + slice = cache.read({3, 2}); + assert_slice_equal(slice, "de"); + + slice = cache.read({8, 2}); + assert_slice_equal(slice, "ij"); + + slice = cache.read({10, 4}); + assert_slice_equal(slice, "klmn"); + + slice = cache.read({15, 4}); + assert_slice_equal(slice, "pqrs"); + + // Zero-sized + slice = cache.read({14, 0}); + assert_slice_equal(slice, ""); + slice = cache.read({25, 0}); + assert_slice_equal(slice, ""); + + // Non-cached ranges + ASSERT_FALSE(cache.read({20, 
+ ASSERT_FALSE(cache.read({20, 3}).buffer); + ASSERT_FALSE(cache.read({19, 3}).buffer); + ASSERT_FALSE(cache.read({0, 3}).buffer); + ASSERT_FALSE(cache.read({25, 2}).buffer); + + // Release cache entries before 15. After that cache entries would be: {10, 9}, {20, 2} + cache.evictEntriesBefore(15); + ASSERT_FALSE(cache.read({1, 2}).buffer); + ASSERT_FALSE(cache.read({8, 2}).buffer); + slice = cache.read({10, 4}); + assert_slice_equal(slice, "klmn"); + slice = cache.read({20, 2}); + assert_slice_equal(slice, "uv"); + } +} // namespace orc diff --git a/c++/test/TestCompression.cc b/c++/test/TestCompression.cc index a77800a3dd..e95a6f0169 100644 --- a/c++/test/TestCompression.cc +++ b/c++/test/TestCompression.cc @@ -42,12 +42,12 @@ namespace orc { } void decompressAndVerify(const MemoryOutputStream& memStream, CompressionKind kind, - const char* data, size_t size, MemoryPool& pool) { + const char* data, size_t size, MemoryPool& pool, uint64_t capacity) { auto inputStream = std::make_unique<SeekableArrayInputStream>(memStream.getData(), memStream.getLength()); std::unique_ptr<SeekableInputStream> decompressStream = - createDecompressor(kind, std::move(inputStream), 1024, pool, getDefaultReaderMetrics()); + createDecompressor(kind, std::move(inputStream), capacity, pool, getDefaultReaderMetrics()); const char* decompressedBuffer; int decompressedSize; @@ -66,7 +66,7 @@ namespace orc { CompressionStrategy strategy, uint64_t capacity, uint64_t block, MemoryPool& pool, const char* data, size_t dataSize) { std::unique_ptr<BufferedOutputStream> compressStream = - createCompressor(kind, outStream, strategy, capacity, block, pool, nullptr); + createCompressor(kind, outStream, strategy, capacity, block, block, pool, nullptr); size_t pos = 0; char* compressBuffer; @@ -99,7 +99,7 @@ namespace orc { char testData[] = "hello world!"; compressAndVerify(kind, &memStream, CompressionStrategy_SPEED, capacity, block, *pool, testData, sizeof(testData)); - decompressAndVerify(memStream, kind, testData, sizeof(testData), *pool); + decompressAndVerify(memStream, kind, testData, sizeof(testData), *pool, capacity); } TEST(TestCompression, zlib_compress_original_string) { @@ -117,7 +117,7 @@ namespace orc { char testData[] = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; compressAndVerify(kind, &memStream, CompressionStrategy_SPEED, capacity, block, *pool, testData, sizeof(testData)); - decompressAndVerify(memStream, kind, testData, sizeof(testData), *pool); + decompressAndVerify(memStream, kind, testData, sizeof(testData), *pool, capacity); } TEST(TestCompression, compress_simple_repeated_string) { @@ -138,7 +138,7 @@ namespace orc { } compressAndVerify(kind, &memStream, CompressionStrategy_SPEED, capacity, block, *pool, testData, 170); - decompressAndVerify(memStream, kind, testData, 170, *pool); + decompressAndVerify(memStream, kind, testData, 170, *pool, capacity); } TEST(TestCompression, zlib_compress_two_blocks) { @@ -158,7 +158,7 @@ namespace orc { generateRandomData(testData, dataSize, true); compressAndVerify(kind, &memStream, CompressionStrategy_SPEED, capacity, block, *pool, testData, dataSize); - decompressAndVerify(memStream, kind, testData, dataSize, *pool); + decompressAndVerify(memStream, kind, testData, dataSize, *pool, capacity); delete[] testData; } @@ -179,7 +179,7 @@ namespace orc { generateRandomData(testData, dataSize, false); compressAndVerify(kind, &memStream, CompressionStrategy_SPEED, capacity, block, *pool, testData, dataSize); - decompressAndVerify(memStream, kind, testData, dataSize, *pool); + decompressAndVerify(memStream, kind, testData, dataSize, *pool, capacity); delete[]
testData; } @@ -205,7 +205,7 @@ namespace orc { } std::unique_ptr<BufferedOutputStream> compressStream = createCompressor( - kind, &memStream, CompressionStrategy_SPEED, capacity, block, *pool, nullptr); + kind, &memStream, CompressionStrategy_SPEED, capacity, block, block, *pool, nullptr); EXPECT_TRUE(ps.SerializeToZeroCopyStream(compressStream.get())); compressStream->flush(); @@ -213,8 +213,8 @@ auto inputStream = std::make_unique<SeekableArrayInputStream>(memStream.getData(), memStream.getLength()); - std::unique_ptr<SeekableInputStream> decompressStream = - createDecompressor(kind, std::move(inputStream), 1024, *pool, getDefaultReaderMetrics()); + std::unique_ptr<SeekableInputStream> decompressStream = createDecompressor( + kind, std::move(inputStream), capacity, *pool, getDefaultReaderMetrics()); proto::PostScript ps2; ps2.ParseFromZeroCopyStream(decompressStream.get()); @@ -312,7 +312,7 @@ namespace orc { uint64_t batchSize = 1024, blockSize = 256; AppendOnlyBufferedStream outStream(createCompressor( - kind, &memStream, strategy, DEFAULT_MEM_STREAM_SIZE, blockSize, *pool, nullptr)); + kind, &memStream, strategy, DEFAULT_MEM_STREAM_SIZE, blockSize, blockSize, *pool, nullptr)); // write 3 batches of data and record positions between every batch size_t row = 0; @@ -335,7 +335,7 @@ namespace orc { auto inputStream = std::make_unique<SeekableArrayInputStream>(memStream.getData(), memStream.getLength()); std::unique_ptr<SeekableInputStream> decompressStream = createDecompressor( - kind, std::move(inputStream), blockSize, *pool, getDefaultReaderMetrics()); + kind, std::move(inputStream), DEFAULT_MEM_STREAM_SIZE, *pool, getDefaultReaderMetrics()); // prepare positions to seek to EXPECT_EQ(rowIndexEntry1.positions_size(), rowIndexEntry2.positions_size()); diff --git a/c++/test/TestConvertColumnReader.cc b/c++/test/TestConvertColumnReader.cc index 83798289db..6096fe4573 100644 --- a/c++/test/TestConvertColumnReader.cc +++ b/c++/test/TestConvertColumnReader.cc @@ -27,6 +27,7 @@ #include "ConvertColumnReader.hh" #include "MemoryInputStream.hh" #include "MemoryOutputStream.hh" +#include namespace orc { @@ -650,6 +651,10 @@ namespace orc { auto& readC2 = dynamic_cast<Decimal128VectorBatch&>(*readStructBatch.fields[1]); auto& readC3 = dynamic_cast<Decimal64VectorBatch&>(*readStructBatch.fields[2]); auto& readC4 = dynamic_cast<Decimal128VectorBatch&>(*readStructBatch.fields[3]); + EXPECT_TRUE(9 == readC1.precision && 5 == readC1.scale); + EXPECT_TRUE(20 == readC2.precision && 5 == readC2.scale); + EXPECT_TRUE(10 == readC3.precision && 3 == readC3.scale); + EXPECT_TRUE(19 == readC4.precision && 3 == readC4.scale); EXPECT_EQ(TEST_CASES, readBatch->numElements); for (int i = 0; i < TEST_CASES / 2; i++) { size_t idx = static_cast<size_t>(i); @@ -815,4 +820,411 @@ namespace orc { } } + TEST(ConvertColumnReader, TestConvertStringVariantToNumeric) { + constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024; + constexpr int TEST_CASES = 6; + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + std::unique_ptr<Type> fileType( + Type::buildTypeFromString("struct")); + std::shared_ptr<Type> readType(Type::buildTypeFromString("struct")); + WriterOptions options; + auto writer = createWriter(*fileType, &memStream, options); + auto batch = writer->createRowBatch(TEST_CASES); + auto structBatch = dynamic_cast<StructVectorBatch*>(batch.get()); + auto& c1 = dynamic_cast<StringVectorBatch&>(*structBatch->fields[0]); + auto& c2 = dynamic_cast<StringVectorBatch&>(*structBatch->fields[1]); + auto& c3 = dynamic_cast<StringVectorBatch&>(*structBatch->fields[2]); + std::vector<std::string> raw1{"", "123456", "0", "-1234567890", "999999999999999999999999", + "error"}; + std::vector<std::string> raw2{"", "123456", "0", "-1234567890", "999999999999999999999999", + "error"}; + std::vector<std::string> raw3{ + "", "123456", "-0.0",
"-123456789.0123", "1000000000000000000000000000000000000000", + "error"}; + + c1.notNull[0] = c2.notNull[0] = c3.notNull[0] = false; + for (int i = 1; i < TEST_CASES; i++) { + c1.data[i] = raw1[i].data(); + c1.length[i] = raw1[i].length(); + c1.notNull[i] = true; + + c2.data[i] = raw2[i].data(); + c2.length[i] = raw2[i].length(); + c2.notNull[i] = true; + + c3.data[i] = raw3[i].data(); + c3.length[i] = raw3[i].length(); + c3.notNull[i] = true; + } + + structBatch->numElements = c1.numElements = c2.numElements = c3.numElements = TEST_CASES; + structBatch->hasNulls = c1.hasNulls = c2.hasNulls = c3.hasNulls = true; + writer->add(*batch); + writer->close(); + auto inStream = std::make_unique(memStream.getData(), memStream.getLength()); + auto pool = getDefaultPool(); + auto reader = createReader(*pool, std::move(inStream)); + RowReaderOptions rowReaderOptions; + rowReaderOptions.setUseTightNumericVector(true); + rowReaderOptions.setReadType(readType); + auto rowReader = reader->createRowReader(rowReaderOptions); + auto readBatch = rowReader->createRowBatch(TEST_CASES); + EXPECT_EQ(true, rowReader->next(*readBatch)); + + auto& readSturctBatch = dynamic_cast(*readBatch); + auto& readC1 = dynamic_cast(*readSturctBatch.fields[0]); + auto& readC2 = dynamic_cast(*readSturctBatch.fields[1]); + auto& readC3 = dynamic_cast(*readSturctBatch.fields[2]); + + EXPECT_FALSE(readC1.notNull[0]); + EXPECT_FALSE(readC2.notNull[0]); + EXPECT_FALSE(readC3.notNull[0]); + + for (int i = 1; i < 4; i++) { + EXPECT_TRUE(readC1.notNull[i]); + EXPECT_TRUE(readC2.notNull[i]); + EXPECT_TRUE(readC3.notNull[i]); + } + + for (int i = 4; i <= 5; i++) { + EXPECT_FALSE(readC1.notNull[i]) << i; + EXPECT_FALSE(readC2.notNull[i]) << i; + EXPECT_FALSE(readC3.notNull[i]) << i; + } + + EXPECT_EQ(readC1.data[1], 1); + EXPECT_EQ(readC2.data[1], 123456); + EXPECT_FLOAT_EQ(readC3.data[1], 123456); + + EXPECT_EQ(readC1.data[2], 0); + EXPECT_EQ(readC2.data[2], 0); + EXPECT_FLOAT_EQ(readC3.data[2], -0.0); + + EXPECT_EQ(readC1.data[3], 1); + EXPECT_EQ(readC2.data[3], -1234567890); + EXPECT_FLOAT_EQ(readC3.data[3], -123456789.0123); + } + + TEST(ConvertColumnReader, TestConvertStringVariant) { + constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024; + constexpr int TEST_CASES = 4; + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + std::unique_ptr fileType( + Type::buildTypeFromString("struct")); + std::shared_ptr readType( + Type::buildTypeFromString("struct")); + WriterOptions options; + auto writer = createWriter(*fileType, &memStream, options); + auto batch = writer->createRowBatch(TEST_CASES); + auto structBatch = dynamic_cast(batch.get()); + auto& c1 = dynamic_cast(*structBatch->fields[0]); + auto& c2 = dynamic_cast(*structBatch->fields[1]); + auto& c3 = dynamic_cast(*structBatch->fields[2]); + + std::vector raw1{"", "12345", "1", "1234"}; + std::vector raw2{"", "12345", "1", "1234"}; + std::vector raw3{"", "12345", "1", "1234"}; + + c1.notNull[0] = c2.notNull[0] = c3.notNull[0] = false; + for (int i = 1; i < TEST_CASES; i++) { + c1.data[i] = raw1[i].data(); + c1.length[i] = raw1[i].length(); + c1.notNull[i] = true; + + c2.data[i] = raw2[i].data(); + c2.length[i] = raw2[i].length(); + c2.notNull[i] = true; + + c3.data[i] = raw3[i].data(); + c3.length[i] = raw3[i].length(); + c3.notNull[i] = true; + } + structBatch->numElements = c1.numElements = c2.numElements = c3.numElements = TEST_CASES; + structBatch->hasNulls = c1.hasNulls = c2.hasNulls = c3.hasNulls = true; + writer->add(*batch); + writer->close(); + auto inStream = 
std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength()); + auto pool = getDefaultPool(); + auto reader = createReader(*pool, std::move(inStream)); + RowReaderOptions rowReaderOptions; + rowReaderOptions.setUseTightNumericVector(true); + rowReaderOptions.setReadType(readType); + auto rowReader = reader->createRowReader(rowReaderOptions); + auto readBatch = rowReader->createRowBatch(TEST_CASES); + EXPECT_EQ(true, rowReader->next(*readBatch)); + + auto& readStructBatch = dynamic_cast<StructVectorBatch&>(*readBatch); + auto& readC1 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[0]); + auto& readC2 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[1]); + auto& readC3 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[2]); + + EXPECT_FALSE(readC1.notNull[0]); + EXPECT_FALSE(readC2.notNull[0]); + EXPECT_FALSE(readC3.notNull[0]); + + for (int i = 1; i < TEST_CASES; i++) { + EXPECT_TRUE(readC1.notNull[i]); + EXPECT_TRUE(readC2.notNull[i]); + EXPECT_TRUE(readC3.notNull[i]); + } + + EXPECT_EQ(std::string(readC1.data[1], readC1.length[1]), "12345"); + EXPECT_EQ(std::string(readC2.data[1], readC2.length[1]), "1234"); + EXPECT_EQ(std::string(readC3.data[1], readC3.length[1]), "1234"); + + EXPECT_EQ(std::string(readC1.data[2], readC1.length[2]), "1 "); + EXPECT_EQ(std::string(readC2.data[2], readC2.length[2]), "1 "); + EXPECT_EQ(std::string(readC3.data[2], readC3.length[2]), "1"); + + EXPECT_EQ(std::string(readC1.data[3], readC1.length[3]), "1234 "); + EXPECT_EQ(std::string(readC2.data[3], readC2.length[3]), "1234"); + EXPECT_EQ(std::string(readC3.data[3], readC3.length[3]), "1234"); + } + + // Returns year/month/day triple in civil calendar + // Preconditions: z is number of days since 1970-01-01 and is in the range: + // [numeric_limits<Int>::min(), numeric_limits<Int>::max()-719468]. + template <typename Int> + constexpr std::tuple<Int, unsigned, unsigned> civil_from_days(Int z) noexcept { + static_assert(std::numeric_limits<unsigned>::digits >= 18, + "This algorithm has not been ported to a 16 bit unsigned integer"); + static_assert(std::numeric_limits<Int>::digits >= 20, + "This algorithm has not been ported to a 16 bit signed integer"); + z += 719468; + const Int era = (z >= 0 ? z : z - 146096) / 146097; + const unsigned doe = static_cast<unsigned>(z - era * 146097); // [0, 146096] + const unsigned yoe = (doe - doe / 1460 + doe / 36524 - doe / 146096) / 365; // [0, 399] + const Int y = static_cast<Int>(yoe) + era * 400; + const unsigned doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // [0, 365] + const unsigned mp = (5 * doy + 2) / 153; // [0, 11] + const unsigned d = doy - (153 * mp + 2) / 5 + 1; // [1, 31] + const unsigned m = mp < 10 ? mp + 3 : mp - 9; // [1, 12] + return std::tuple<Int, unsigned, unsigned>(y + (m <= 2), m, d); + }
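+ // civil_from_days above is Howard Hinnant's public-domain days-to-civil-date algorithm, kept verbatim so the test is self-contained. + // timestampToString below renders (seconds, nanos) in the given zone as "YYYY-MM-DD HH:MM:SS[.frac] zone" for string comparison.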
+ + static std::string timestampToString(int64_t seconds, int64_t nanos, + const std::string& zoneName) { + auto& timezone = getTimezoneByName(zoneName); + seconds = timezone.convertToUTC(seconds); + time_t t = static_cast<time_t>(seconds); + char buffer[100]; + constexpr auto SECOND_IN_DAY = 3600 * 24; + auto day = t < 0 ? (t - SECOND_IN_DAY + 1) / SECOND_IN_DAY : t / SECOND_IN_DAY; + + auto [y, m, d] = civil_from_days(day); + auto second_in_day = t % (3600 * 24); + if (second_in_day < 0) { + second_in_day += 3600 * 24; + } + auto h = second_in_day % (3600 * 24) / 3600; + auto min = second_in_day % 3600 / 60; + auto s = second_in_day % 60; + std::snprintf(buffer, sizeof(buffer), "%04d-%02d-%02d %02ld:%02ld:%02ld", y, m, d, h, min, s); + std::string result(buffer); + if (nanos) { + while (nanos % 10 == 0) nanos /= 10; + result = result + "." + std::to_string(nanos); + } + result = result + " " + zoneName; + return result; + } + + TEST(ConvertColumnReader, TestConvertStringVariantToTimestamp) { + constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024; + constexpr int TEST_CASES = 1024; + const std::string writerTimezone = "America/New_York"; + const std::string readerTimezone = "Australia/Sydney"; + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + std::unique_ptr<Type> fileType(Type::buildTypeFromString("struct")); + std::shared_ptr<Type> readType( + Type::buildTypeFromString("struct")); + WriterOptions options; + options.setTimezoneName(writerTimezone); + auto writer = createWriter(*fileType, &memStream, options); + auto batch = writer->createRowBatch(TEST_CASES); + auto structBatch = dynamic_cast<StructVectorBatch*>(batch.get()); + auto& c1 = dynamic_cast<StringVectorBatch&>(*structBatch->fields[0]); + auto& c2 = dynamic_cast<StringVectorBatch&>(*structBatch->fields[1]); + + std::vector<std::string> raw1, raw2; + raw1.reserve(TEST_CASES * 3); + raw2.reserve(TEST_CASES * 3); + std::vector<int64_t> ts1, ts2; + + for (int i = 0; i < TEST_CASES; i++) { + char buff[100]; + auto size = ::snprintf(buff, sizeof(buff), "%04d-%02d-27 12:34:56.789", 1960 + (i / 12), + (i % 12) + 1); + raw1.emplace_back(buff, size); + raw2.push_back(raw1.back() + " " + writerTimezone); + c1.data[i] = const_cast<char*>(raw1.back().c_str()); + c1.length[i] = raw1.back().length(); + c2.data[i] = const_cast<char*>(raw2.back().c_str()); + c2.length[i] = raw2.back().length(); + } + structBatch->numElements = c1.numElements = c2.numElements = TEST_CASES; + structBatch->hasNulls = c1.hasNulls = c2.hasNulls = false; + writer->add(*batch); + + for (int i = 0; i < TEST_CASES; i++) { + char buff[100]; + auto size = + ::snprintf(buff, sizeof(buff), "%04d-%02d-27 12:34:56", 1960 + (i / 12), (i % 12) + 1); + raw1.emplace_back(buff, size); + raw2.push_back(raw1.back() + " " + writerTimezone); + c1.data[i] = const_cast<char*>(raw1.back().c_str()); + c1.length[i] = raw1.back().length(); + c2.data[i] = const_cast<char*>(raw2.back().c_str()); + c2.length[i] = raw2.back().length(); + } + structBatch->numElements = c1.numElements = c2.numElements = TEST_CASES; + structBatch->hasNulls = c1.hasNulls = c2.hasNulls = false; + writer->add(*batch); + + { + raw1.push_back("2024?11-14 00:01:02"); + raw2.push_back("2024-01-02 03:04:05.678 tz/error"); + c1.data[0] = const_cast<char*>(raw1.back().c_str()); + c1.length[0] = raw1.back().length(); + c2.data[0] = const_cast<char*>(raw2.back().c_str()); + c2.length[0] = raw2.back().length(); + + c1.notNull[1] = false; + c2.notNull[1] = false; + + raw1.push_back("2024-12-14 00:01:02.-1"); + raw2.push_back("2024-01-02 03:04:05.678"); + c1.data[2] = const_cast<char*>(raw1.back().c_str()); + c1.length[2] = raw1.back().length(); + c2.data[2] = const_cast<char*>(raw2.back().c_str()); + c2.length[2] = raw2.back().length(); + } + structBatch->numElements = c1.numElements = c2.numElements = 3; + structBatch->hasNulls = c1.hasNulls = c2.hasNulls = true; + writer->add(*batch); + + writer->close(); + + auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength()); + auto pool = getDefaultPool(); + auto reader = createReader(*pool, std::move(inStream)); + RowReaderOptions rowReaderOptions; + rowReaderOptions.setUseTightNumericVector(true); + rowReaderOptions.setReadType(readType); + rowReaderOptions.setTimezoneName(readerTimezone); + rowReaderOptions.throwOnSchemaEvolutionOverflow(true); + auto rowReader = reader->createRowReader(rowReaderOptions); + auto readBatch = rowReader->createRowBatch(TEST_CASES * 2); + EXPECT_EQ(true, rowReader->next(*readBatch)); + + auto&
readStructBatch = dynamic_cast<StructVectorBatch&>(*readBatch); + auto& readC1 = dynamic_cast<TimestampVectorBatch&>(*readStructBatch.fields[0]); + auto& readC2 = dynamic_cast<TimestampVectorBatch&>(*readStructBatch.fields[1]); + + for (int i = 0; i < TEST_CASES * 2; i++) { + EXPECT_TRUE(readC1.notNull[i]); + EXPECT_TRUE(readC2.notNull[i]); + EXPECT_EQ(raw1[i] + " " + readerTimezone, + timestampToString(readC1.data[i], readC1.nanoseconds[i], readerTimezone)); + EXPECT_EQ(raw2[i], timestampToString(readC2.data[i], readC2.nanoseconds[i], writerTimezone)); + } + + rowReaderOptions.throwOnSchemaEvolutionOverflow(false); + rowReader = reader->createRowReader(rowReaderOptions); + EXPECT_EQ(true, rowReader->next(*readBatch)); + EXPECT_EQ(true, rowReader->next(*readBatch)); + EXPECT_EQ(3, readBatch->numElements); + EXPECT_FALSE(readC1.notNull[0]); + EXPECT_FALSE(readC2.notNull[0]); + EXPECT_FALSE(readC1.notNull[1]); + EXPECT_FALSE(readC2.notNull[1]); + EXPECT_FALSE(readC1.notNull[2]); + EXPECT_FALSE(readC2.notNull[2]); + } + + TEST(ConvertColumnReader, TestConvertStringVariantToDecimal) { + constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024; + constexpr int TEST_CASES = 1024; + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + std::unique_ptr<Type> fileType(Type::buildTypeFromString("struct")); + std::shared_ptr<Type> readType( + Type::buildTypeFromString("struct")); + WriterOptions options; + auto writer = createWriter(*fileType, &memStream, options); + auto batch = writer->createRowBatch(TEST_CASES); + auto structBatch = dynamic_cast<StructVectorBatch*>(batch.get()); + auto& c1 = dynamic_cast<StringVectorBatch&>(*structBatch->fields[0]); + auto& c2 = dynamic_cast<StringVectorBatch&>(*structBatch->fields[1]); + + // <raw string, c1 notNull, c2 notNull, expected c1 value, expected c2 value> + std::vector<std::tuple<std::string, bool, bool, int64_t, Int128>> rawDataAndExpected; + + rawDataAndExpected = { + /* 0 */ {"123456789012345678901234567890123456789", false, false, int64_t(), Int128()}, + /* 1 */ {"123456789012345678901234567890.1234567890", false, false, int64_t(), Int128()}, + /* 2 */ {"-123456789012345678901234567890.1234567890", false, false, int64_t(), Int128()}, + /* 3 */ {"-foo.bar", false, false, int64_t(), Int128()}, + /* 4 */ {"-foo.123", false, false, int64_t(), Int128()}, + /* 5 */ {"-123.foo", false, false, int64_t(), Int128()}, + /* 6 */ {"-123foo.123", false, false, int64_t(), Int128()}, + /* 7 */ {"-123.123foo", false, false, int64_t(), Int128()}, + /* 8 */ {"-.", false, false, int64_t(), Int128()}, + /* 9 */ {"-", false, false, int64_t(), Int128()}, + /* 10 */ {".", false, false, int64_t(), Int128()}, + /* 11 */ {"", false, false, int64_t(), Int128()}, + /* 12 */ {".12345", false, false, int64_t(), Int128()}, + /* 13 */ {"12345.", false, false, int64_t(), Int128()}, + /* 14 */ {"-1", true, true, -100000LL, Int128("-10000000000")}, + /* 15 */ {"-1.0", true, true, -100000LL, Int128("-10000000000")}, + /* 16 */ {"1", true, true, 100000, Int128("10000000000")}, + /* 17 */ {"1.0", true, true, 100000, Int128("10000000000")}, + /* 18 */ {"12345", true, true, 1234500000, Int128("123450000000000")}, + /* 19 */ {"12345.12345", true, true, 1234512345LL, Int128("123451234500000")}, + /* 20 */ {"-12345.12345", true, true, -1234512345LL, Int128("-123451234500000")}, + /* 21 */ {"1234567890", false, true, int64_t(), Int128("12345678900000000000")}, + /* 22 */ {"-1234567890", false, true, int64_t(), Int128("-12345678900000000000")}, + /* 23 */ {"1234567890.123", false, true, int64_t(), Int128("12345678901230000000")}, + /* 24 */ {"-1234567890.1234567", false, true, int64_t(), Int128("-12345678901234567000")}, + /* 25 */ {"1234567890123.12345", false, true, int64_t(), Int128("12345678901231234500000")}, + /* 26 */
{"-1234567890123.12345678901", false, true, int64_t(), Int128("-12345678901231234567890")}}; + for (int i = 0; i < rawDataAndExpected.size(); i++) { + c1.data[i] = c2.data[i] = const_cast(std::get<0>(rawDataAndExpected[i]).c_str()); + c1.length[i] = c2.length[i] = std::get<0>(rawDataAndExpected[i]).length(); + } + + structBatch->numElements = c1.numElements = c2.numElements = rawDataAndExpected.size(); + structBatch->hasNulls = c1.hasNulls = c2.hasNulls = false; + writer->add(*batch); + writer->close(); + + auto inStream = std::make_unique(memStream.getData(), memStream.getLength()); + auto pool = getDefaultPool(); + auto reader = createReader(*pool, std::move(inStream)); + RowReaderOptions rowReaderOptions; + rowReaderOptions.setUseTightNumericVector(true); + rowReaderOptions.setReadType(readType); + auto rowReader = reader->createRowReader(rowReaderOptions); + auto readBatch = rowReader->createRowBatch(TEST_CASES); + EXPECT_EQ(true, rowReader->next(*readBatch)); + + auto& readSturctBatch = dynamic_cast(*readBatch); + auto& readC1 = dynamic_cast(*readSturctBatch.fields[0]); + auto& readC2 = dynamic_cast(*readSturctBatch.fields[1]); + EXPECT_EQ(readBatch->numElements, rawDataAndExpected.size()); + + for (int i = 0; i < readBatch->numElements; i++) { + bool expectedNotNull1 = std::get<1>(rawDataAndExpected[i]); + bool expectedNotNull2 = std::get<2>(rawDataAndExpected[i]); + EXPECT_EQ(expectedNotNull1, readC1.notNull[i]) << i; + EXPECT_EQ(expectedNotNull2, readC2.notNull[i]) << i; + if (expectedNotNull1) { + EXPECT_EQ(std::get<3>(rawDataAndExpected[i]), readC1.values[i]) << i; + } + if (expectedNotNull2) { + EXPECT_EQ(std::get<4>(rawDataAndExpected[i]), readC2.values[i]) << i; + } + } + } + } // namespace orc diff --git a/c++/test/TestDecompression.cc b/c++/test/TestDecompression.cc index dc6caeda0e..125c5e85a4 100644 --- a/c++/test/TestDecompression.cc +++ b/c++/test/TestDecompression.cc @@ -395,6 +395,26 @@ namespace orc { ASSERT_TRUE(!result->Next(&ptr, &length)); } + TEST_F(TestDecompression, testLzoOverflow) { + const unsigned char bad_lzo_data[] = {// Header: compressedSize = 12, original = false + 0x18, 0x00, 0x00, + + // LZO body: token and literal length extension + 0x00, // token: extended literal length + 0xFF, // extension byte 1 + + // Literal data: only 10 bytes far less than 273 + 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'}; + + std::unique_ptr result = createDecompressor( + CompressionKind_LZO, + std::make_unique(bad_lzo_data, ARRAY_SIZE(bad_lzo_data)), + 128 * 1024, *getDefaultPool(), getDefaultReaderMetrics()); + const void* ptr; + int length; + EXPECT_THROW(result->Next(&ptr, &length), ParseError); + } + TEST_F(TestDecompression, testLz4Empty) { const unsigned char buffer[] = {0}; std::unique_ptr result = createDecompressor( @@ -545,7 +565,7 @@ namespace orc { *getDefaultPool(), getDefaultReaderMetrics()); const void* ptr; int length; - ASSERT_THROW(result->BackUp(20), std::logic_error); + ASSERT_THROW(result->BackUp(20), CompressionError); ASSERT_EQ(true, result->Next(&ptr, &length)); ASSERT_EQ(30, length); for (int i = 0; i < 10; ++i) { @@ -554,7 +574,7 @@ namespace orc { } } result->BackUp(10); - ASSERT_THROW(result->BackUp(2), std::logic_error); + ASSERT_THROW(result->BackUp(2), CompressionError); ASSERT_EQ(true, result->Next(&ptr, &length)); ASSERT_EQ(10, length); for (int i = 0; i < 10; ++i) { diff --git a/c++/test/TestDictionaryEncoding.cc b/c++/test/TestDictionaryEncoding.cc index f3dcaa0067..40c1b1a605 100644 --- a/c++/test/TestDictionaryEncoding.cc +++ 
b/c++/test/TestDictionaryEncoding.cc @@ -25,6 +25,7 @@ #include "wrap/gtest-wrapper.h" #include +#include #include namespace orc { @@ -53,6 +54,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024); + options.setMemoryBlockSize(64); options.setCompressionBlockSize(1024); options.setCompression(CompressionKind_ZLIB); options.setMemoryPool(pool); @@ -109,6 +111,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024); + options.setMemoryBlockSize(64); options.setCompressionBlockSize(1024); options.setCompression(CompressionKind_ZLIB); options.setMemoryPool(pool); @@ -171,6 +174,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024); options.setCompressionBlockSize(1024); + options.setMemoryBlockSize(64); options.setCompression(CompressionKind_ZLIB); options.setMemoryPool(pool); options.setDictionaryKeySizeThreshold(threshold); @@ -233,6 +237,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024); + options.setMemoryBlockSize(64); options.setCompressionBlockSize(1024); options.setCompression(CompressionKind_ZLIB); options.setMemoryPool(pool); @@ -302,7 +307,8 @@ namespace orc { WriterOptions options; options.setStripeSize(1); - options.setCompressionBlockSize(1024); + options.setMemoryBlockSize(1024); + options.setCompressionBlockSize(2 * 1024); options.setCompression(CompressionKind_ZLIB); options.setMemoryPool(pool); options.setDictionaryKeySizeThreshold(threshold); @@ -429,4 +435,57 @@ namespace orc { testDictionaryMultipleStripes(DICT_THRESHOLD, false); testDictionaryMultipleStripes(FALLBACK_THRESHOLD, false); } + + TEST(DictionaryEncoding, decodeDictionary) { + size_t rowCount = 8192; + size_t dictionarySize = 100; + auto* memoryPool = getDefaultPool(); + + auto encodedStringBatch = std::make_shared<EncodedStringVectorBatch>(rowCount, *memoryPool); + EXPECT_FALSE(encodedStringBatch->dictionaryDecoded); + encodedStringBatch->numElements = rowCount; + encodedStringBatch->hasNulls = true; + encodedStringBatch->isEncoded = true; + encodedStringBatch->dictionary = std::make_shared<StringDictionary>(*memoryPool); + + auto& dictionary = *encodedStringBatch->dictionary; + dictionary.dictionaryBlob.resize(3 * dictionarySize); + dictionary.dictionaryOffset.resize(dictionarySize + 1); + dictionary.dictionaryOffset[0] = 0; + for (uint64_t i = 0; i < dictionarySize; ++i) { + std::ostringstream oss; + oss << std::setw(3) << std::setfill('0') << i; + + auto str = oss.str(); + memcpy(&dictionary.dictionaryBlob[3 * i], str.data(), str.size()); + dictionary.dictionaryOffset[i + 1] = 3 * (i + 1); + } + + for (uint64_t i = 0; i < rowCount; ++i) { + if (i % 10 == 0) { + encodedStringBatch->notNull[i] = 0; + encodedStringBatch->index[i] = 0; + } else { + encodedStringBatch->notNull[i] = 1; + encodedStringBatch->index[i] = i % dictionarySize; + } + } + + encodedStringBatch->decodeDictionary(); + EXPECT_TRUE(encodedStringBatch->dictionaryDecoded); + EXPECT_EQ(0, encodedStringBatch->blob.size()); + + for (uint64_t i = 0; i < rowCount; ++i) { + if (encodedStringBatch->notNull[i]) { + auto index = encodedStringBatch->index[i]; + char* buf = nullptr; + int64_t buf_size = 0; + dictionary.getValueByIndex(index, buf, buf_size); + + EXPECT_EQ(buf, encodedStringBatch->data[i]); + EXPECT_EQ(buf_size, encodedStringBatch->length[i]); + } + } + } + } // namespace orc diff --git a/c++/test/TestInt128.cc b/c++/test/TestInt128.cc index 54dcff4567..be5b65b3a7 100644 --- a/c++/test/TestInt128.cc +++ b/c++/test/TestInt128.cc @@ -555,6 +555,11 @@ namespace orc { num =
Int128("-12345678901122334455667788990011122233"); EXPECT_EQ("-12345678901122334455667788990011122233", num.toString()); + + num = Int128::maximumValue(); + EXPECT_EQ("170141183460469231731687303715884105727", num.toString()); + num = Int128::minimumValue(); + EXPECT_EQ("-170141183460469231731687303715884105728", num.toString()); } TEST(Int128, testToDecimalString) { diff --git a/c++/test/TestPredicateLeaf.cc b/c++/test/TestPredicateLeaf.cc index 2703776e39..3946123ec5 100644 --- a/c++/test/TestPredicateLeaf.cc +++ b/c++/test/TestPredicateLeaf.cc @@ -168,6 +168,12 @@ namespace orc { return colStats; } + static proto::ColumnStatistics createIncompleteNullStats() { + proto::ColumnStatistics colStats; + colStats.set_number_of_values(0); + return colStats; + } + static TruthValue evaluate(const PredicateLeaf& pred, const proto::ColumnStatistics& pbStats, const BloomFilter* bf = nullptr) { return pred.evaluate(WriterVersion_ORC_135, pbStats, bf); @@ -663,4 +669,10 @@ namespace orc { evaluate(pred8, createTimestampStats(2114380800, 1109000, 2114380800, 6789100))); } + TEST(TestPredicateLeaf, testLackOfSataistics) { + PredicateLeaf pred(PredicateLeaf::Operator::IS_NULL, PredicateDataType::STRING, 1, {}); + EXPECT_EQ(TruthValue::YES_NO, evaluate(pred, createStringStats("c", "d", true))); + EXPECT_EQ(TruthValue::YES_NO_NULL, evaluate(pred, createIncompleteNullStats())); + } + } // namespace orc diff --git a/c++/test/TestPredicatePushdown.cc b/c++/test/TestPredicatePushdown.cc index e949fc2898..5c8ed14e73 100644 --- a/c++/test/TestPredicatePushdown.cc +++ b/c++/test/TestPredicatePushdown.cc @@ -33,6 +33,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024 * 1024) .setCompressionBlockSize(1024) + .setMemoryBlockSize(64) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(rowIndexStride); @@ -510,6 +511,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1) .setCompressionBlockSize(1024) + .setMemoryBlockSize(64) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(1000); diff --git a/c++/test/TestReader.cc b/c++/test/TestReader.cc index f709f693f1..f9df6edc92 100644 --- a/c++/test/TestReader.cc +++ b/c++/test/TestReader.cc @@ -155,7 +155,10 @@ namespace orc { ASSERT_THAT(rowReader->getSelectedColumns(), ElementsAreArray(expected)); } - std::unique_ptr createNestedListMemReader(MemoryOutputStream& memStream) { + std::unique_ptr createNestedListMemReader(MemoryOutputStream& memStream, + const std::vector& stripesToPrefetch, + const std::list& columnsToPrefetch, + bool prefetchTwice) { MemoryPool* pool = getDefaultPool(); auto type = std::unique_ptr( @@ -166,6 +169,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024 * 1024) .setCompressionBlockSize(1024) + .setMemoryBlockSize(64) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(1000); @@ -217,20 +221,43 @@ namespace orc { auto inStream = std::make_unique(memStream.getData(), memStream.getLength()); ReaderOptions readerOptions; readerOptions.setMemoryPool(*pool); - return createReader(std::move(inStream), readerOptions); + auto reader = createReader(std::move(inStream), readerOptions); + + reader->preBuffer(stripesToPrefetch, columnsToPrefetch); + if (prefetchTwice) { + reader->preBuffer(stripesToPrefetch, columnsToPrefetch); + } + + return reader; } - TEST(TestReadIntent, testListAll) { + class TestReadIntentFromNestedList + : public ::testing::TestWithParam< + std::tuple, std::list, bool>> {}; + + 
TEST_P(TestReadIntentFromNestedList, testListAll) { + const auto& params = GetParam(); + const std::vector<uint32_t>& stripesToPrefetch = std::get<0>(params); + const std::list<uint64_t>& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr<Reader> reader = createNestedListMemReader(memStream); + std::unique_ptr<Reader> reader = + createNestedListMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of int_array. verifySelection(reader, {{1, ReadIntent_ALL}}, {0, 1, 2}); } - TEST(TestReadIntent, testListOffsets) { + TEST_P(TestReadIntentFromNestedList, testListOffsets) { + const auto& params = GetParam(); + const std::vector<uint32_t>& stripesToPrefetch = std::get<0>(params); + const std::list<uint64_t>& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr<Reader> reader = createNestedListMemReader(memStream); + std::unique_ptr<Reader> reader = + createNestedListMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select only the offsets of int_array. verifySelection(reader, {{1, ReadIntent_OFFSETS}}, {0, 1}); @@ -243,26 +270,44 @@ namespace orc { verifySelection(reader, {{3, ReadIntent_OFFSETS}, {5, ReadIntent_OFFSETS}}, {0, 3, 4, 5}); } - TEST(TestReadIntent, testListAllAndOffsets) { + TEST_P(TestReadIntentFromNestedList, testListAllAndOffsets) { + const auto& params = GetParam(); + const std::vector<uint32_t>& stripesToPrefetch = std::get<0>(params); + const std::list<uint64_t>& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr<Reader> reader = createNestedListMemReader(memStream); + std::unique_ptr<Reader> reader = + createNestedListMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of int_array and only the outermost offsets of int_array_array_array. verifySelection(reader, {{1, ReadIntent_ALL}, {3, ReadIntent_OFFSETS}}, {0, 1, 2, 3}); } - TEST(TestReadIntent, testListConflictingIntent) { + TEST_P(TestReadIntentFromNestedList, testListConflictingIntent) { + const auto& params = GetParam(); + const std::vector<uint32_t>& stripesToPrefetch = std::get<0>(params); + const std::list<uint64_t>& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr<Reader> reader = createNestedListMemReader(memStream); + std::unique_ptr<Reader> reader = + createNestedListMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // test conflicting ReadIntent on nested list. verifySelection(reader, {{3, ReadIntent_OFFSETS}, {5, ReadIntent_ALL}}, {0, 3, 4, 5, 6}); verifySelection(reader, {{3, ReadIntent_ALL}, {5, ReadIntent_OFFSETS}}, {0, 3, 4, 5, 6}); } - TEST(TestReadIntent, testRowBatchContent) { + TEST_P(TestReadIntentFromNestedList, testRowBatchContent) { + const auto& params = GetParam(); + const std::vector<uint32_t>& stripesToPrefetch = std::get<0>(params); + const std::list<uint64_t>& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr<Reader> reader = createNestedListMemReader(memStream); + std::unique_ptr<Reader> reader = + createNestedListMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of int_array and only the offsets of int_array_array.
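+ // preBuffer is only a prefetch hint, so the contents read back below must be identical with or without it.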
RowReaderOptions::IdReadIntentMap idReadIntentMap = {{1, ReadIntent_ALL}, @@ -298,7 +343,24 @@ namespace orc { EXPECT_EQ(nullptr, intArrayArrayArrayBatch.elements); } + INSTANTIATE_TEST_SUITE_P( + TestReadIntentFromNestedListInstance, TestReadIntentFromNestedList, + ::testing::Values( + std::make_tuple(std::vector<uint32_t>{}, std::list<uint64_t>{}, true), + std::make_tuple(std::vector<uint32_t>{}, std::list<uint64_t>{}, false), + std::make_tuple(std::vector<uint32_t>{}, std::list<uint64_t>{1, 3}, true), + std::make_tuple(std::vector<uint32_t>{}, std::list<uint64_t>{1, 3}, false), + std::make_tuple(std::vector<uint32_t>{0}, std::list<uint64_t>{}, true), + std::make_tuple(std::vector<uint32_t>{0}, std::list<uint64_t>{}, false), + std::make_tuple(std::vector<uint32_t>{0}, std::list<uint64_t>{1, 3}, true), + std::make_tuple(std::vector<uint32_t>{0}, std::list<uint64_t>{1, 3}, false), + std::make_tuple(std::vector<uint32_t>{1000}, std::list<uint64_t>{1000}, true), + std::make_tuple(std::vector<uint32_t>{1000}, std::list<uint64_t>{1000}, false))); + + std::unique_ptr<Reader> createNestedMapMemReader(MemoryOutputStream& memStream, + const std::vector<uint32_t>& stripesToPrefetch, + const std::list<uint64_t>& columnsToPrefetch, + bool prefetchTwice) { MemoryPool* pool = getDefaultPool(); auto type = std::unique_ptr<Type>( @@ -310,6 +372,7 @@ WriterOptions options; options.setStripeSize(1024 * 1024) .setCompressionBlockSize(1024) + .setMemoryBlockSize(64) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(1000); @@ -387,20 +450,42 @@ namespace orc { auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength()); ReaderOptions readerOptions; readerOptions.setMemoryPool(*pool); - return createReader(std::move(inStream), readerOptions); + auto reader = createReader(std::move(inStream), readerOptions); + + reader->preBuffer(stripesToPrefetch, columnsToPrefetch); + if (prefetchTwice) { + reader->preBuffer(stripesToPrefetch, columnsToPrefetch); + } + return reader; } - TEST(TestReadIntent, testMapAll) { + class TestReadIntentFromNestedMap + : public ::testing::TestWithParam< + std::tuple<std::vector<uint32_t>, std::list<uint64_t>, bool>> {}; + + TEST_P(TestReadIntentFromNestedMap, testMapAll) { + const auto& params = GetParam(); + const std::vector<uint32_t>& stripesToPrefetch = std::get<0>(params); + const std::list<uint64_t>& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr<Reader> reader = createNestedMapMemReader(memStream); + std::unique_ptr<Reader> reader = + createNestedMapMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of single_map. verifySelection(reader, {{2, ReadIntent_ALL}}, {0, 2, 3, 4}); } - TEST(TestReadIntent, testMapOffsets) { + TEST_P(TestReadIntentFromNestedMap, testMapOffsets) { + const auto& params = GetParam(); + const std::vector<uint32_t>& stripesToPrefetch = std::get<0>(params); + const std::list<uint64_t>& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr<Reader> reader = createNestedMapMemReader(memStream); + std::unique_ptr<Reader> reader = + createNestedMapMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select only the offsets of single_map.
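+ // Offsets-only intent selects the map column itself but not its key and value children.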
verifySelection(reader, {{2, ReadIntent_OFFSETS}}, {0, 2}); @@ -412,17 +497,29 @@ namespace orc { verifySelection(reader, {{5, ReadIntent_OFFSETS}, {9, ReadIntent_OFFSETS}}, {0, 5, 7, 9}); } - TEST(TestReadIntent, testMapAllAndOffsets) { + TEST_P(TestReadIntentFromNestedMap, testMapAllAndOffsets) { + const auto& params = GetParam(); + const std::vector<uint32_t>& stripesToPrefetch = std::get<0>(params); + const std::list<uint64_t>& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr<Reader> reader = createNestedMapMemReader(memStream); + std::unique_ptr<Reader> reader = + createNestedMapMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of single_map and only the outermost offsets of nested_map. verifySelection(reader, {{2, ReadIntent_ALL}, {5, ReadIntent_OFFSETS}}, {0, 2, 3, 4, 5}); } - TEST(TestReadIntent, testMapConflictingIntent) { + TEST_P(TestReadIntentFromNestedMap, testMapConflictingIntent) { + const auto& params = GetParam(); + const std::vector<uint32_t>& stripesToPrefetch = std::get<0>(params); + const std::list<uint64_t>& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr<Reader> reader = createNestedMapMemReader(memStream); + std::unique_ptr<Reader> reader = + createNestedMapMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // test conflicting ReadIntent on nested_map. verifySelection(reader, {{5, ReadIntent_OFFSETS}, {9, ReadIntent_ALL}}, {0, 5, 7, 9, 10, 11}); @@ -432,9 +529,15 @@ namespace orc { {0, 5, 7, 8, 9, 10, 11}); } - TEST(TestReadIntent, testMapRowBatchContent) { + TEST_P(TestReadIntentFromNestedMap, testMapRowBatchContent) { + const auto& params = GetParam(); + const std::vector<uint32_t>& stripesToPrefetch = std::get<0>(params); + const std::list<uint64_t>& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr<Reader> reader = createNestedMapMemReader(memStream); + std::unique_ptr<Reader> reader = + createNestedMapMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of single_map and only the offsets of nested_map.
RowReaderOptions::IdReadIntentMap idReadIntentMap = {{2, ReadIntent_ALL}, @@ -480,7 +583,24 @@ namespace orc { EXPECT_EQ(nullptr, nestedMapBatch.elements); } - std::unique_ptr createNestedUnionMemReader(MemoryOutputStream& memStream) { + INSTANTIATE_TEST_SUITE_P( + TestReadIntentFromNestedMapInstance, TestReadIntentFromNestedMap, + ::testing::Values( + std::make_tuple(std::vector{}, std::list{}, true), + std::make_tuple(std::vector{}, std::list{}, false), + std::make_tuple(std::vector{}, std::list{1, 5}, true), + std::make_tuple(std::vector{}, std::list{1, 5}, false), + std::make_tuple(std::vector{0}, std::list{}, true), + std::make_tuple(std::vector{0}, std::list{}, false), + std::make_tuple(std::vector{0}, std::list{1, 5}, true), + std::make_tuple(std::vector{0}, std::list{1, 5}, false), + std::make_tuple(std::vector{1000}, std::list{1000}, true), + std::make_tuple(std::vector{1000}, std::list{1000}, false))); + + std::unique_ptr createNestedUnionMemReader(MemoryOutputStream& memStream, + const std::vector& stripesToPrefetch, + const std::list& columnsToPrefetch, + bool prefetchTwice) { MemoryPool* pool = getDefaultPool(); auto type = std::unique_ptr( @@ -492,6 +612,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024 * 1024) .setCompressionBlockSize(1024) + .setMemoryBlockSize(64) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(1000); @@ -563,20 +684,43 @@ namespace orc { ReaderOptions readerOptions; readerOptions.setMemoryPool(*pool); readerOptions.setReaderMetrics(nullptr); - return createReader(std::move(inStream), readerOptions); + auto reader = createReader(std::move(inStream), readerOptions); + + reader->preBuffer(stripesToPrefetch, columnsToPrefetch); + if (prefetchTwice) { + reader->preBuffer(stripesToPrefetch, columnsToPrefetch); + } + + return reader; } - TEST(TestReadIntent, testUnionAll) { + class TestReadIntentFromNestedUnion + : public ::testing::TestWithParam< + std::tuple, std::list, bool>> {}; + + TEST_P(TestReadIntentFromNestedUnion, testUnionAll) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedUnionMemReader(memStream); + std::unique_ptr reader = + createNestedUnionMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of single_union. verifySelection(reader, {{2, ReadIntent_ALL}}, {0, 2, 3, 4}); } - TEST(TestReadIntent, testUnionOffsets) { + TEST_P(TestReadIntentFromNestedUnion, testUnionOffsets) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedUnionMemReader(memStream); + std::unique_ptr reader = + createNestedUnionMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select only the offsets of single_union. 
verifySelection(reader, {{2, ReadIntent_OFFSETS}}, {0, 2}); @@ -589,17 +733,29 @@ namespace orc { {0, 2, 5, 6, 7, 8, 11}); } - TEST(TestReadIntent, testUnionAllAndOffsets) { + TEST_P(TestReadIntentFromNestedUnion, testUnionAllAndOffsets) { + const auto& params = GetParam(); + const std::vector<uint32_t>& stripesToPrefetch = std::get<0>(params); + const std::list<uint64_t>& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr<Reader> reader = createNestedUnionMemReader(memStream); + std::unique_ptr<Reader> reader = + createNestedUnionMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of single_union and only the outermost offsets of nested_union. verifySelection(reader, {{2, ReadIntent_ALL}, {5, ReadIntent_OFFSETS}}, {0, 2, 3, 4, 5}); } - TEST(TestReadIntent, testUnionConflictingIntent) { + TEST_P(TestReadIntentFromNestedUnion, testUnionConflictingIntent) { + const auto& params = GetParam(); + const std::vector<uint32_t>& stripesToPrefetch = std::get<0>(params); + const std::list<uint64_t>& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr<Reader> reader = createNestedUnionMemReader(memStream); + std::unique_ptr<Reader> reader = + createNestedUnionMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // test conflicting ReadIntent on nested_union. verifySelection(reader, {{5, ReadIntent_OFFSETS}, {8, ReadIntent_ALL}}, {0, 5, 6, 7, 8, 9, 10, 11}); @@ -610,9 +766,15 @@ namespace orc { {0, 5, 6, 7, 8, 9, 10, 11}); } - TEST(TestReadIntent, testUnionRowBatchContent) { + TEST_P(TestReadIntentFromNestedUnion, testUnionRowBatchContent) { + const auto& params = GetParam(); + const std::vector<uint32_t>& stripesToPrefetch = std::get<0>(params); + const std::list<uint64_t>& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr<Reader> reader = createNestedUnionMemReader(memStream); + std::unique_ptr<Reader> reader = + createNestedUnionMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of single_union and only the offsets of nested_union.
RowReaderOptions::IdReadIntentMap idReadIntentMap = {{2, ReadIntent_ALL}, @@ -662,10 +824,25 @@ namespace orc { EXPECT_EQ(1, nestedUnionBatch.offsets.data()[1]); } + INSTANTIATE_TEST_SUITE_P( + TestReadIntentFromNestedUnionInstance, TestReadIntentFromNestedUnion, + ::testing::Values( + std::make_tuple(std::vector{}, std::list{}, true), + std::make_tuple(std::vector{}, std::list{}, false), + std::make_tuple(std::vector{}, std::list{1, 2}, true), + std::make_tuple(std::vector{}, std::list{1, 2}, false), + std::make_tuple(std::vector{0}, std::list{}, true), + std::make_tuple(std::vector{0}, std::list{}, false), + std::make_tuple(std::vector{0}, std::list{1, 2}, true), + std::make_tuple(std::vector{0}, std::list{1, 2}, false), + std::make_tuple(std::vector{1000}, std::list{1000}, true), + std::make_tuple(std::vector{1000}, std::list{1000}, false))); + TEST(TestReadIntent, testSeekOverEmptyPresentStream) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); uint64_t rowCount = 5000; + { auto type = std::unique_ptr( Type::buildTypeFromString("struct,col3:struct," @@ -673,6 +850,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024 * 1024) .setCompressionBlockSize(1024) + .setMemoryBlockSize(64) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(1000); diff --git a/c++/test/TestRleEncoder.cc b/c++/test/TestRleEncoder.cc index 1c24a69515..d458236cbc 100644 --- a/c++/test/TestRleEncoder.cc +++ b/c++/test/TestRleEncoder.cc @@ -84,8 +84,8 @@ namespace orc { std::make_unique(memStream.getData(), memStream.getLength()), isSinged, version, *getDefaultPool(), getDefaultReaderMetrics()); - int64_t* decodedData = new int64_t[numValues]; - decoder->next(decodedData, numValues, notNull); + std::vector decodedData(numValues); + decoder->next(decodedData.data(), numValues, notNull); for (uint64_t i = 0; i < numValues; ++i) { if (!notNull || notNull[i]) { @@ -93,7 +93,12 @@ namespace orc { } } - delete[] decodedData; + decoder->next(decodedData.data(), numValues, notNull); + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + EXPECT_EQ(data[i], decodedData[i]); + } + } } std::unique_ptr RleTest::getEncoder(RleVersion version, MemoryOutputStream& memStream, @@ -128,6 +133,9 @@ namespace orc { char* notNull = numNulls == 0 ? 
nullptr : new char[numValues]; int64_t* data = new int64_t[numValues]; generateData(numValues, start, delta, random, data, numNulls, notNull); + encoder->add(data, numValues, notNull); + encoder->finishEncode(); + encoder->add(data, numValues, notNull); encoder->flush(); @@ -243,6 +251,9 @@ namespace orc { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); std::unique_ptr<RleEncoder> encoder = getEncoder(RleVersion_2, memStream, isSigned); + encoder->add(data, numValues, nullptr); + encoder->finishEncode(); + encoder->add(data, numValues, nullptr); encoder->flush(); diff --git a/c++/test/TestSchemaEvolution.cc b/c++/test/TestSchemaEvolution.cc index c52ba009fa..d146853573 100644 --- a/c++/test/TestSchemaEvolution.cc +++ b/c++/test/TestSchemaEvolution.cc @@ -45,17 +45,17 @@ namespace orc { directEncoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); EXPECT_CALL(streams, getEncoding(testing::_)).WillRepeatedly(testing::Return(directEncoding)); - EXPECT_CALL(streams, getStreamProxy(testing::_, testing::_, testing::_)) - .WillRepeatedly(testing::Return(nullptr)); - std::string dummyStream("dummy"); - ON_CALL(streams, getStreamProxy(1, proto::Stream_Kind_SECONDARY, testing::_)) - .WillByDefault(testing::Return( - new SeekableArrayInputStream(dummyStream.c_str(), dummyStream.length()))); + EXPECT_CALL(streams, getStreamProxy(testing::_, testing::_, testing::_)) + .WillRepeatedly(testing::ReturnNew<SeekableArrayInputStream>(dummyStream.c_str(), + dummyStream.length())); + EXPECT_CALL(streams, isDecimalAsLong()).WillRepeatedly(testing::Return(false)); EXPECT_CALL(streams, getSchemaEvolution()).WillRepeatedly(testing::Return(&se)); + EXPECT_CALL(streams, getSelectedColumns()) + .WillRepeatedly(testing::Return(std::vector<bool>{true, true})); - EXPECT_TRUE(buildReader(*fileType, streams) != nullptr); + EXPECT_TRUE(buildReader(*fileType, streams, true) != nullptr); } return true; } @@ -66,8 +66,8 @@ namespace orc { {2, "struct"}, {3, "struct"}, {4, "struct"}, {5, "struct"}, {6, "struct"}, {7, "struct"}, - {8, "struct"}, {9, "struct"}, - {10, "struct"}, {11, "struct"}, + {8, "struct"}, {9, "struct"}, + {10, "struct"}, {11, "struct"}, {12, "struct"}, {13, "struct"}, {14, "struct"}, {15, "struct"}, {16, "struct"}}; @@ -148,6 +148,38 @@ namespace orc { } } + // conversion from string variant to numeric + for (size_t i = 7; i <= 11; i++) { + for (size_t j = 0; j <= 6; j++) { + canConvert[i][j] = true; + needConvert[i][j] = true; + } + } + + // conversion from string variant to string variant + for (size_t i = 7; i <= 11; i++) { + for (size_t j = 7; j <= 11; j++) { + canConvert[i][j] = true; + needConvert[i][j] = (i != j); + } + } + + // conversion from string variant to decimal + for (size_t i = 7; i <= 11; i++) { + for (size_t j = 12; j <= 13; j++) { + canConvert[i][j] = true; + needConvert[i][j] = (i != j); + } + } + + // conversion from string variant to timestamp + for (size_t i = 7; i <= 11; i++) { + for (size_t j = 14; j <= 15; j++) { + canConvert[i][j] = true; + needConvert[i][j] = (i != j); + } + } + for (size_t i = 0; i < typesSize; i++) { for (size_t j = 0; j < typesSize; j++) { testConvertReader(types[i], types[j], canConvert[i][j], needConvert[i][j]); diff --git a/c++/test/TestSearchArgument.cc b/c++/test/TestSearchArgument.cc index bf9b82ea5c..e51ee1e8b5 100644 --- a/c++/test/TestSearchArgument.cc +++ b/c++/test/TestSearchArgument.cc @@ -481,4 +481,13 @@ namespace orc { std::invalid_argument); } + TEST(TestSearchArgument, testBadTreeNode) { + auto invalidNode = std::make_shared<ExpressionTree>(ExpressionTree::Operator::NOT, NodeList{});
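+ // A NOT node with an empty child list is malformed; printing and evaluating it should both throw.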
EXPECT_THROW(invalidNode->toString(), std::invalid_argument); + + std::vector<TruthValue> leaves; + leaves.push_back(TruthValue::YES); + EXPECT_THROW(invalidNode->evaluate(leaves), std::invalid_argument); + } + } // namespace orc diff --git a/c++/test/TestStripeIndexStatistics.cc b/c++/test/TestStripeIndexStatistics.cc index 34a4649c35..a529792c17 100644 --- a/c++/test/TestStripeIndexStatistics.cc +++ b/c++/test/TestStripeIndexStatistics.cc @@ -46,18 +46,19 @@ namespace orc { intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>( stripeStats->getRowIndexStatistics(1, 0)); EXPECT_EQ( - "Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 1\nMaximum: 2000\nSum: 2001000\n", + "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 1\nMaximum: 2000\nSum: " "2001000\n", intColStats->toString()); intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>( stripeStats->getRowIndexStatistics(1, 1)); EXPECT_EQ( - "Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 2001\nMaximum: 4000\nSum: " + "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 2001\nMaximum: 4000\nSum: " "6001000\n", intColStats->toString()); intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>( stripeStats->getRowIndexStatistics(1, 2)); EXPECT_EQ( - "Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 4001\nMaximum: 6000\nSum: " + "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 4001\nMaximum: 6000\nSum: " "10001000\n", intColStats->toString()); @@ -65,23 +66,48 @@ namespace orc { stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>( stripeStats->getRowIndexStatistics(2, 0)); EXPECT_EQ( - "Data type: String\nValues: 2000\nHas null: no\nMinimum: 1000\nMaximum: 9a\nTotal length: " + "Data type: String\nValues: 2000\nHas null: yes\nMinimum: 1000\nMaximum: 9a\nTotal length: " "7892\n", stringColStats->toString()); stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>( stripeStats->getRowIndexStatistics(2, 1)); EXPECT_EQ( - "Data type: String\nValues: 2000\nHas null: no\nMinimum: 2001\nMaximum: 4000\nTotal " + "Data type: String\nValues: 2000\nHas null: yes\nMinimum: 2001\nMaximum: 4000\nTotal " "length: " "8000\n", stringColStats->toString()); stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>( stripeStats->getRowIndexStatistics(2, 2)); EXPECT_EQ( - "Data type: String\nValues: 2000\nHas null: no\nMinimum: 4001\nMaximum: 6000\nTotal " + "Data type: String\nValues: 2000\nHas null: yes\nMinimum: 4001\nMaximum: 6000\nTotal " "length: " "8000\n", stringColStats->toString()); + + std::unique_ptr<StripeStatistics> stripeLevelStats = reader->getStripeStatistics(0, false); + const orc::IntegerColumnStatistics* stripeLevelIntColStats; + stripeLevelIntColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>( + stripeLevelStats->getColumnStatistics(1)); + EXPECT_EQ( + "Data type: Integer\nValues: 6000\nHas null: yes\nMinimum: 1\nMaximum: 6000\nSum: " + "18003000\n", + stripeLevelIntColStats->toString()); + + const orc::StringColumnStatistics* stripeLevelStringColStats; + stripeLevelStringColStats = reinterpret_cast<const orc::StringColumnStatistics*>( + stripeLevelStats->getColumnStatistics(2)); + EXPECT_EQ( + "Data type: String\nValues: 6000\nHas null: yes\nMinimum: 1000\nMaximum: 9a\nTotal length: " + "23892\n", + stripeLevelStringColStats->toString()); + + intColStats = + reinterpret_cast<const orc::IntegerColumnStatistics*>(stripeStats->getColumnStatistics(1)); + stringColStats = + reinterpret_cast<const orc::StringColumnStatistics*>(stripeStats->getColumnStatistics(2)); + + EXPECT_EQ(intColStats->toString(), stripeLevelIntColStats->toString()); + EXPECT_EQ(stringColStats->toString(), stripeLevelStringColStats->toString()); } } // namespace orc diff --git a/c++/test/TestTimestampStatistics.cc b/c++/test/TestTimestampStatistics.cc index d20a049557..e005fa6cf6 100644 ---
a/c++/test/TestTimestampStatistics.cc +++ b/c++/test/TestTimestampStatistics.cc @@ -68,6 +68,19 @@ namespace orc { "00:00:00.688\nLowerBound: 1995-01-01 00:00:00.688\nMaximum: 2037-01-01 " "00:00:00.0\nUpperBound: 2037-01-01 00:00:00.1\n", stripeColStats->toString()); + + std::unique_ptr<StripeStatistics> stripeStatsWithOutRowIndex = + reader->getStripeStatistics(0, false); + const orc::TimestampColumnStatistics* stripeColStatsOnly = + reinterpret_cast<const orc::TimestampColumnStatistics*>( + stripeStatsWithOutRowIndex->getColumnStatistics(0)); + + EXPECT_TRUE(stripeColStatsOnly->hasMinimum()); + EXPECT_TRUE(stripeColStatsOnly->hasMaximum()); + EXPECT_EQ(stripeColStats->toString(), stripeColStatsOnly->toString()); + EXPECT_EQ(stripeStats->getNumberOfColumns(), stripeStatsWithOutRowIndex->getNumberOfColumns()); + EXPECT_THROW(stripeStatsWithOutRowIndex->getRowIndexStatistics(1, 1), NotImplementedYet); + EXPECT_THROW(stripeStatsWithOutRowIndex->getNumberOfRowIndexStats(1), NotImplementedYet); } TEST(TestTimestampStatistics, testTimezoneUTC) { diff --git a/c++/test/TestTimezone.cc b/c++/test/TestTimezone.cc index 2330fcfb04..94895cd700 100644 --- a/c++/test/TestTimezone.cc +++ b/c++/test/TestTimezone.cc @@ -21,6 +21,7 @@ #include "wrap/gmock.h" #include "wrap/gtest-wrapper.h" +#include <algorithm> #include #include #include @@ -421,20 +422,61 @@ namespace orc { } TEST(TestTimezone, testMissingTZDB) { - const char* tzDirBackup = std::getenv("TZDIR"); - if (tzDirBackup != nullptr) { + const char* tzDir = std::getenv("TZDIR"); + std::string tzDirBackup; + if (tzDir != nullptr) { + // std::string makes a deep copy of the buffer, so unsetting the + // environment variable does not invalidate the saved value + tzDirBackup = tzDir; ASSERT_TRUE(delEnv("TZDIR")); } ASSERT_TRUE(setEnv("TZDIR", "/path/to/wrong/tzdb")); - EXPECT_THAT([]() { getTimezoneByName("America/Los_Angeles"); }, + EXPECT_THAT([]() { getTimezoneByName("America/Los_Angeles").getVersion(); }, testing::ThrowsMessage<TimezoneError>(testing::HasSubstr( "Time zone file /path/to/wrong/tzdb/America/Los_Angeles does not exist."
" Please install IANA time zone database and set TZDIR env."))); - if (tzDirBackup != nullptr) { - ASSERT_TRUE(setEnv("TZDIR", tzDirBackup)); + if (!tzDirBackup.empty()) { + ASSERT_TRUE(setEnv("TZDIR", tzDirBackup.c_str())); } else { ASSERT_TRUE(delEnv("TZDIR")); } } + TEST(TestTimezone, testTzdbFromCondaEnv) { + const char* tzDir = std::getenv("TZDIR"); + // test only makes sense if TZDIR exists + if (tzDir != nullptr) { + std::string tzDirBackup = tzDir; + ASSERT_TRUE(delEnv("TZDIR")); + + // remove "/share/zoneinfo" from TZDIR (as set through TZDATA_DIR in CI) to + // get the equivalent of CONDA_PREFIX, relative to the location of the tzdb + std::string condaPrefix(tzDirBackup); + condaPrefix += "/../.."; + ASSERT_TRUE(setEnv("CONDA_PREFIX", condaPrefix.c_str())); + + // small test sample to ensure tzbd loads with CONDA_PREFIX, even without TZDIR + const Timezone* zrh = &getTimezoneByName("Europe/Zurich"); + EXPECT_EQ("CET", getVariantFromZone(*zrh, "2024-03-31 00:59:59")); + EXPECT_EQ("CEST", getVariantFromZone(*zrh, "2024-03-31 01:00:00")); + EXPECT_EQ("CEST", getVariantFromZone(*zrh, "2024-10-27 00:59:59")); + EXPECT_EQ("CET", getVariantFromZone(*zrh, "2024-10-27 01:00:00")); + + // CONDA_PREFIX contains backslashes on windows; test that this doesn't blow up + std::replace(condaPrefix.begin(), condaPrefix.end(), '/', '\\'); + ASSERT_TRUE(setEnv("CONDA_PREFIX", condaPrefix.c_str())); + + // as above, but different timezone to avoid hitting cache + const Timezone* syd = &getTimezoneByName("Australia/Sydney"); + EXPECT_EQ("AEDT", getVariantFromZone(*syd, "2024-04-06 15:59:59")); + EXPECT_EQ("AEST", getVariantFromZone(*syd, "2024-04-06 16:00:00")); + EXPECT_EQ("AEST", getVariantFromZone(*syd, "2024-10-05 15:59:59")); + EXPECT_EQ("AEDT", getVariantFromZone(*syd, "2024-10-05 16:00:00")); + + // restore state of environment variables + ASSERT_TRUE(delEnv("CONDA_PREFIX")); + ASSERT_TRUE(setEnv("TZDIR", tzDirBackup.c_str())); + } + } + } // namespace orc diff --git a/c++/test/TestType.cc b/c++/test/TestType.cc index c9ac2f2850..cec0d8d2c4 100644 --- a/c++/test/TestType.cc +++ b/c++/test/TestType.cc @@ -325,7 +325,7 @@ namespace orc { expectLogicErrorDuringParse("int<>", "Invalid < after int type."); expectLogicErrorDuringParse("array(int)", "Missing < after array."); expectLogicErrorDuringParse("struct>", - "Invalid struct type. No field name set."); + "Invalid struct type. 
Field name can not contain '<'."); expectLogicErrorDuringParse("struct", "Missing comma after field."); } diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc index d160f82ff1..975462e30c 100644 --- a/c++/test/TestWriter.cc +++ b/c++/test/TestWriter.cc @@ -41,11 +41,11 @@ namespace orc { const int DEFAULT_MEM_STREAM_SIZE = 100 * 1024 * 1024; // 100M - std::unique_ptr createWriter(uint64_t stripeSize, uint64_t compresionblockSize, - CompressionKind compression, const Type& type, - MemoryPool* memoryPool, OutputStream* stream, - FileVersion version, uint64_t stride = 0, - const std::string& timezone = "GMT", + std::unique_ptr createWriter(uint64_t stripeSize, uint64_t memoryBlockSize, + uint64_t compresionblockSize, CompressionKind compression, + const Type& type, MemoryPool* memoryPool, + OutputStream* stream, FileVersion version, + uint64_t stride = 0, const std::string& timezone = "GMT", bool useTightNumericVector = false) { WriterOptions options; options.setStripeSize(stripeSize); @@ -56,6 +56,9 @@ namespace orc { options.setFileVersion(version); options.setTimezoneName(timezone); options.setUseTightNumericVector(useTightNumericVector); + options.setMemoryBlockSize(memoryBlockSize); + // enable align block bound to row group when stride is not 0 + options.setAlignBlockBoundToRowGroup(true); return createWriter(type, stream, options); } @@ -83,7 +86,56 @@ namespace orc { return reader->createRowReader(rowReaderOpts); } - class WriterTest : public TestWithParam { + void verifyCompressionBlockAlignment(std::unique_ptr& reader, uint64_t columnCount) { + auto stripeCount = reader->getNumberOfStripes(); + for (uint64_t stripeIndex = 0; stripeIndex < stripeCount; ++stripeIndex) { + for (uint64_t i = 0; i < columnCount; ++i) { + auto rowGroupIndexMap = reader->getRowGroupIndex(stripeIndex); + EXPECT_TRUE(rowGroupIndexMap.size() > 0); + auto rowGroupIndex = rowGroupIndexMap[columnCount]; + auto subType = reader->getType().getSubtype(i); + EXPECT_TRUE(rowGroupIndex.positions.size() > 0); + for (auto rowGroupPositions : rowGroupIndex.positions) { + for (uint64_t posIndex = 0; posIndex < rowGroupPositions.size(); ++posIndex) { + // After we call finishStream(), unusedBufferSize is set to 0, + // so only the first position is valid in each recordPosition call. + switch (subType->getKind()) { + case DECIMAL: + case STRING: + case BINARY: + case CHAR: + case VARCHAR: { + if (posIndex != 0 && posIndex != 2) { + EXPECT_EQ(rowGroupPositions[posIndex], 0); + } + break; + } + case TIMESTAMP_INSTANT: + case TIMESTAMP: { + if (posIndex != 0 && posIndex != 3) { + EXPECT_EQ(rowGroupPositions[posIndex], 0); + } + break; + } + default: { + if (posIndex != 0) { + EXPECT_EQ(rowGroupPositions[posIndex], 0); + } + break; + } + } + } + } + } + } + } + + struct TestParams { + FileVersion fileVersion; + bool enableAlignBlockBoundToRowGroup; + }; + + class WriterTest : public TestWithParam { // You can implement all the usual fixture class members here. // To access the test parameter, call GetParam() from class // TestWithParam. 
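
To make the intent of verifyCompressionBlockAlignment above easier to follow, here is a small illustrative sketch of walking the same row-group index through the Reader API the helper uses. It is not part of the patch: the function name and printed output are ours, and the map-of-RowGroupIndex shape is assumed from the helper's calls to getRowGroupIndex and its use of the positions member.

#include <cstdint>
#include <iostream>

#include <orc/Reader.hh>  // assumed public header for the Reader API used above

// Sketch only: walks the row-group index the same way the test helper does.
// With setAlignBlockBoundToRowGroup(true), each row group starts on a fresh
// compression block, so every recorded position after the block offset
// (plus the extra value-stream entry for string/timestamp columns that the
// switch above carves out) is expected to be zero.
void dumpRowGroupPositions(orc::Reader& reader, uint64_t stripeIndex) {
  auto rowGroupIndexMap = reader.getRowGroupIndex(stripeIndex);
  for (const auto& columnEntry : rowGroupIndexMap) {
    for (const auto& positions : columnEntry.second.positions) {
      // positions[0] is the start offset of the compression block.
      std::cout << "column " << columnEntry.first << " row group starts at "
                << positions[0] << '\n';
    }
  }
}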
@@ -91,13 +143,15 @@ namespace orc { protected: FileVersion fileVersion; + bool enableAlignBlockBoundToRowGroup; public: - WriterTest() : fileVersion(FileVersion::v_0_11()) {} + WriterTest() : fileVersion(FileVersion::v_0_11()), enableAlignBlockBoundToRowGroup(false) {} }; void WriterTest::SetUp() { - fileVersion = GetParam(); + fileVersion = GetParam().fileVersion; + enableAlignBlockBoundToRowGroup = GetParam().enableAlignBlockBoundToRowGroup; } TEST_P(WriterTest, writeEmptyFile) { @@ -107,10 +161,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; // 16K uint64_t compressionBlockSize = 1024; // 1k + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion); writer->close(); auto inStream = std::make_unique(memStream.getData(), memStream.getLength()); @@ -135,10 +190,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; // 16K uint64_t compressionBlockSize = 1024; // 1k + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion); std::unique_ptr batch = writer->createRowBatch(1024); StructVectorBatch* structBatch = dynamic_cast(batch.get()); LongVectorBatch* longBatch = dynamic_cast(structBatch->fields[0]); @@ -195,10 +251,11 @@ namespace orc { uint64_t stripeSize = 1024; // 1K uint64_t compressionBlockSize = 1024; // 1k + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion); std::unique_ptr batch = writer->createRowBatch(65535); StructVectorBatch* structBatch = dynamic_cast(batch.get()); LongVectorBatch* longBatch = dynamic_cast(structBatch->fields[0]); @@ -241,13 +298,14 @@ namespace orc { uint64_t stripeSize = 1024; // 1K uint64_t compressionBlockSize = 1024; // 1k + uint64_t memoryBlockSize = 64; char dataBuffer[327675]; uint64_t offset = 0; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 
1024 : 0); std::unique_ptr batch = writer->createRowBatch(65535); StructVectorBatch* structBatch = dynamic_cast(batch.get()); StringVectorBatch* strBatch = dynamic_cast(structBatch->fields[0]); @@ -289,6 +347,9 @@ namespace orc { EXPECT_EQ(i, static_cast(atoi(str.c_str()))); EXPECT_EQ(i, static_cast(atoi(bin.c_str()))); } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } EXPECT_FALSE(rowReader->next(*batch)); } @@ -301,6 +362,7 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 655350; + uint64_t memoryBlockSize = 64; std::vector data(rowCount); for (uint64_t i = 0; i < rowCount; ++i) { @@ -308,8 +370,8 @@ namespace orc { } std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); DoubleVectorBatch* doubleBatch = dynamic_cast(structBatch->fields[0]); @@ -345,6 +407,10 @@ namespace orc { 0.000001f); } EXPECT_FALSE(rowReader->next(*batch)); + + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeShortIntLong) { @@ -356,10 +422,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 65535; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); LongVectorBatch* smallIntBatch = dynamic_cast(structBatch->fields[0]); @@ -396,6 +463,9 @@ namespace orc { EXPECT_EQ(static_cast(i), intBatch->data[i]); EXPECT_EQ(static_cast(i), bigIntBatch->data[i]); } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeTinyint) { @@ -406,16 +476,20 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 65535; + uint64_t memoryBlockSize = 64; - std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + std::unique_ptr writer = createWriter( + stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZSTD, *type, pool, + &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 
1024 : 0, "GMT", true); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); - LongVectorBatch* byteBatch = dynamic_cast(structBatch->fields[0]); + ByteVectorBatch* byteBatch = dynamic_cast(structBatch->fields[0]); + int64_t sum = 0; for (uint64_t i = 0; i < rowCount; ++i) { - byteBatch->data[i] = static_cast(i); + int8_t x = static_cast(i); + byteBatch->data[i] = x; + sum += x; } structBatch->numElements = rowCount; byteBatch->numElements = rowCount; @@ -429,13 +503,29 @@ namespace orc { EXPECT_EQ(rowCount, reader->getNumberOfRows()); batch = rowReader->createRowBatch(rowCount); + rowReader->seekToRow(20); EXPECT_EQ(true, rowReader->next(*batch)); + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } structBatch = dynamic_cast(batch.get()); - byteBatch = dynamic_cast(structBatch->fields[0]); - for (uint64_t i = 0; i < rowCount; ++i) { - EXPECT_EQ(static_cast(i), static_cast(byteBatch->data[i])); - } + auto outByteBatch = dynamic_cast(structBatch->fields[0]); + for (uint64_t i = 0; i < rowCount - 20; ++i) { + EXPECT_EQ(static_cast(i + 20), static_cast(outByteBatch->data[i])); + } + + auto col_stats = reader->getColumnStatistics(1); + ASSERT_NE(col_stats, nullptr); + EXPECT_EQ(col_stats->getNumberOfValues(), rowCount); + EXPECT_FALSE(col_stats->hasNull()); + auto int_stats = dynamic_cast(col_stats.get()); + ASSERT_NE(int_stats, nullptr); + EXPECT_TRUE(int_stats->hasMinimum() && int_stats->hasMaximum()); + EXPECT_EQ(int_stats->getMinimum(), -128); + EXPECT_EQ(int_stats->getMaximum(), 127); + EXPECT_TRUE(int_stats->hasSum()); + EXPECT_EQ(int_stats->getSum(), sum); } TEST_P(WriterTest, writeBooleanColumn) { @@ -446,10 +536,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 65535; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); LongVectorBatch* byteBatch = dynamic_cast(structBatch->fields[0]); @@ -476,6 +567,9 @@ namespace orc { for (uint64_t i = 0; i < rowCount; ++i) { EXPECT_EQ((i % 3) == 0 ? 1 : 0, byteBatch->data[i]); } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeDate) { @@ -486,10 +580,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 1024; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 
1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); @@ -517,6 +612,9 @@ namespace orc { for (uint64_t i = 0; i < rowCount; ++i) { EXPECT_EQ(static_cast(i), longBatch->data[i]); } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeTimestamp) { @@ -527,10 +625,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 102400; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); TimestampVectorBatch* tsBatch = dynamic_cast(structBatch->fields[0]); @@ -562,14 +661,18 @@ namespace orc { EXPECT_EQ(times[i], tsBatch->data[i]); EXPECT_EQ(i * 1000, tsBatch->nanoseconds[i]); } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeNegativeTimestamp) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); std::unique_ptr type(Type::buildTypeFromString("struct")); - auto writer = createWriter(16 * 1024 * 1024, 64 * 1024, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + auto writer = + createWriter(16 * 1024 * 1024, 64 * 1024, 256 * 1024, CompressionKind_ZLIB, *type, pool, + &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); uint64_t batchCount = 5; auto batch = writer->createRowBatch(batchCount * 2); auto structBatch = dynamic_cast(batch.get()); @@ -619,6 +722,10 @@ namespace orc { } EXPECT_EQ(1000000, tsBatch->nanoseconds[i]); } + + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } // TODO: Disable the test below for Windows for following reasons: @@ -638,10 +745,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 1; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion, 0, writerTimezone); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, 0, writerTimezone); auto batch = writer->createRowBatch(rowCount); auto& structBatch = dynamic_cast(*batch); auto& tsBatch = dynamic_cast(*structBatch.fields[0]); @@ -734,10 +842,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 102400; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 
1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); TimestampVectorBatch* tsBatch = dynamic_cast(structBatch->fields[0]); @@ -769,6 +878,9 @@ namespace orc { EXPECT_EQ(times[i], tsBatch->data[i]); EXPECT_EQ(i * 1000, tsBatch->nanoseconds[i]); } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeCharAndVarcharColumn) { @@ -779,13 +891,14 @@ namespace orc { uint64_t stripeSize = 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 65535; + uint64_t memoryBlockSize = 64; char dataBuffer[327675]; uint64_t offset = 0; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); @@ -847,6 +960,9 @@ namespace orc { } EXPECT_FALSE(rowReader->next(*batch)); + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeDecimal64Column) { @@ -858,10 +974,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; // 16K uint64_t compressionBlockSize = 1024; // 1k uint64_t rowCount = 1024; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); Decimal64VectorBatch* decBatch = dynamic_cast(structBatch->fields[0]); @@ -923,6 +1040,9 @@ namespace orc { EXPECT_EQ(dec, decBatch->values[i]); EXPECT_EQ(-dec, decBatch->values[i + maxPrecision]); } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeDecimal128Column) { @@ -934,10 +1054,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 1024; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 
1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); Decimal128VectorBatch* decBatch = dynamic_cast(structBatch->fields[0]); @@ -1009,6 +1130,9 @@ namespace orc { EXPECT_EQ(expected, decBatch->values[i].toString()); EXPECT_EQ("-" + expected, decBatch->values[i + maxPrecision].toString()); } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeListColumn) { @@ -1022,10 +1146,11 @@ namespace orc { uint64_t rowCount = 1024; uint64_t maxListLength = 10; uint64_t offset = 0; + uint64_t memoryBlockSize = 8 * 1024; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount * maxListLength); StructVectorBatch* structBatch = dynamic_cast(batch.get()); @@ -1071,6 +1196,9 @@ namespace orc { EXPECT_EQ(static_cast(i), data[offsets[i] + j]); } } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeMapColumn) { @@ -1081,10 +1209,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 1024, maxListLength = 10, offset = 0; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount * maxListLength); StructVectorBatch* structBatch = dynamic_cast(batch.get()); MapVectorBatch* mapBatch = dynamic_cast(structBatch->fields[0]); @@ -1151,6 +1280,9 @@ namespace orc { EXPECT_EQ(static_cast(i), elemData[offsets[i] + j]); } } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeUnionColumn) { @@ -1162,10 +1294,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 3333; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 
1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); UnionVectorBatch* unionBatch = dynamic_cast(structBatch->fields[0]); @@ -1247,6 +1380,9 @@ namespace orc { break; } } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeUTF8CharAndVarcharColumn) { @@ -1257,9 +1393,10 @@ namespace orc { uint64_t stripeSize = 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 3; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); StringVectorBatch* charBatch = dynamic_cast(structBatch->fields[0]); @@ -1317,6 +1454,9 @@ namespace orc { EXPECT_TRUE(memcmp(varcharBatch->data[2], expectedTwoChars, 4) == 0); EXPECT_FALSE(rowReader->next(*batch)); + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, testWriteListColumnWithNull) { @@ -1326,10 +1466,11 @@ namespace orc { uint64_t stripeSize = 1024; uint64_t compressionBlockSize = 1024; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion); std::unique_ptr batch = writer->createRowBatch(4); StructVectorBatch* structBatch = dynamic_cast(batch.get()); @@ -1407,10 +1548,11 @@ namespace orc { uint64_t stripeSize = 1024; uint64_t compressionBlockSize = 1024; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion); // test data looks like below - // {0} @@ -1485,12 +1627,13 @@ namespace orc { uint64_t stripeSize = 1024; uint64_t compressionBlockSize = 1024; + uint64_t memoryBlockSize = 64; // 10000 rows with every 1000 row as an RG // Each RG has 100 null rows except that the 5th RG is all null std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion, 1000); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, 1000); std::unique_ptr batch = writer->createRowBatch(10000); StructVectorBatch* structBatch = dynamic_cast(batch.get()); @@ -1622,12 +1765,13 @@ namespace orc { TEST_P(WriterTest, testBloomFilter) { WriterOptions options; options.setStripeSize(1024) - .setCompressionBlockSize(64) + .setCompressionBlockSize(1024) .setCompression(CompressionKind_ZSTD) .setMemoryPool(getDefaultPool()) .setRowIndexStride(10000) .setFileVersion(fileVersion) - .setColumnsUseBloomFilter({1, 2, 3}); + .setColumnsUseBloomFilter({1, 2, 3}) + .setMemoryBlockSize(64); // write 65535 rows of data MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); @@ -1716,7 +1860,7 @@ namespace orc { auto type = 
std::unique_ptr(Type::buildTypeFromString("struct")); WriterOptions options; options.setStripeSize(1024 * 1024) - .setCompressionBlockSize(1024) + .setMemoryBlockSize(1024) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(1000); @@ -1809,8 +1953,11 @@ namespace orc { uint64_t rowCount = 5000000; auto type = std::unique_ptr(Type::buildTypeFromString("struct")); WriterOptions options; - options.setStripeSize(1024).setCompressionBlockSize(1024).setCompression(kind).setMemoryPool( - pool); + options.setStripeSize(1024) + .setCompressionBlockSize(1024) + .setMemoryBlockSize(64) + .setCompression(kind) + .setMemoryPool(pool); auto writer = createWriter(*type, &memStream, options); auto batch = writer->createRowBatch(rowCount); @@ -1853,10 +2000,11 @@ namespace orc { WriterOptions options; options.setStripeSize(1024 * 1024) .setCompressionBlockSize(64 * 1024) + .setMemoryBlockSize(1024) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(1000) - .setOutputBufferCapacity(capacity); + .setCompressionBlockSize(capacity); auto writer = createWriter(*type, &memStream, options); auto batch = writer->createRowBatch(rowCount); @@ -1913,6 +2061,7 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 65530; + uint64_t memoryBlockSize = 64; std::vector data(rowCount); for (uint64_t i = 0; i < rowCount; ++i) { @@ -1920,8 +2069,8 @@ namespace orc { } std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion, 0, "GMT", true); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, 0, "GMT", true); // start from here/ std::unique_ptr batch = writer->createRowBatch(rowCount / 2); StructVectorBatch* structBatch = dynamic_cast(batch.get()); @@ -2010,10 +2159,11 @@ namespace orc { uint64_t stripeSize = 1024; // 1K uint64_t compressionBlockSize = 1024; // 1k + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion); std::unique_ptr batch = writer->createRowBatch(65535); StructVectorBatch* structBatch = dynamic_cast(batch.get()); LongVectorBatch* longBatch = dynamic_cast(structBatch->fields[0]); @@ -2065,10 +2215,11 @@ namespace orc { uint64_t stripeSize = 1024; // 1K uint64_t compressionBlockSize = 1024; // 1k + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion); std::unique_ptr batch = writer->createRowBatch(65535); StructVectorBatch* structBatch = dynamic_cast(batch.get()); LongVectorBatch* longBatch = dynamic_cast(structBatch->fields[0]); @@ -2131,6 +2282,7 @@ namespace orc { WriterOptions options; options.setStripeSize(16 * 1024) .setCompressionBlockSize(1024) + .setMemoryBlockSize(64) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(1000); @@ -2201,7 +2353,59 @@ namespace orc { std::invalid_argument); } - INSTANTIATE_TEST_SUITE_P(OrcTest, WriterTest, - Values(FileVersion::v_0_11(), FileVersion::v_0_12(), - FileVersion::UNSTABLE_PRE_2_0())); + 
TEST_P(WriterTest, testLazyLoadTZDB) {
+    MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+    MemoryPool* pool = getDefaultPool();
+    std::unique_ptr<Type> type(Type::buildTypeFromString("struct"));
+
+    uint64_t stripeSize = 1024;            // 1K
+    uint64_t compressionBlockSize = 1024;  // 1k
+    uint64_t memoryBlockSize = 64;
+
+    std::unique_ptr<Writer> writer =
+        createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type,
+                     pool, &memStream, fileVersion, 0, "/ERROR/TIMEZONE");
+    std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(10);
+    StructVectorBatch* structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+    LongVectorBatch* longBatch = dynamic_cast<LongVectorBatch*>(structBatch->fields[0]);
+
+    for (uint64_t j = 0; j < 10; ++j) {
+      for (uint64_t i = 0; i < 10; ++i) {
+        longBatch->data[i] = static_cast<int64_t>(i);
+      }
+      structBatch->numElements = 10;
+      longBatch->numElements = 10;
+
+      writer->add(*batch);
+    }
+
+    writer->close();
+
+    auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
+    std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream));
+    std::unique_ptr<RowReader> rowReader = createRowReader(reader.get(), "/ERROR/TIMEZONE");
+    EXPECT_EQ(100, reader->getNumberOfRows());
+
+    batch = rowReader->createRowBatch(10);
+    for (uint64_t j = 0; j < 10; ++j) {
+      EXPECT_TRUE(rowReader->next(*batch));
+      EXPECT_EQ(10, batch->numElements);
+
+      for (uint64_t i = 0; i < 10; ++i) {
+        structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+        longBatch = dynamic_cast<LongVectorBatch*>(structBatch->fields[0]);
+        EXPECT_EQ(i, longBatch->data[i]);
+      }
+    }
+    EXPECT_FALSE(rowReader->next(*batch));
+  }
+
+  std::vector<TestParams> testParams = {{FileVersion::v_0_11(), true},
+                                        {FileVersion::v_0_11(), false},
+                                        {FileVersion::v_0_12(), false},
+                                        {FileVersion::v_0_12(), true},
+                                        {FileVersion::UNSTABLE_PRE_2_0(), false},
+                                        {FileVersion::UNSTABLE_PRE_2_0(), true}};
+
+  INSTANTIATE_TEST_SUITE_P(OrcTest, WriterTest, ::testing::ValuesIn(testParams));
 }  // namespace orc
diff --git a/cmake_modules/CheckFormat.cmake b/cmake_modules/CheckFormat.cmake
new file mode 100644
index 0000000000..17017da133
--- /dev/null
+++ b/cmake_modules/CheckFormat.cmake
@@ -0,0 +1,111 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Needed for linting targets, etc.
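
Stepping back to the TestWriter changes that end just above: the suite now expands to one run per (FileVersion, enableAlignBlockBoundToRowGroup) pair via ::testing::ValuesIn. For readers unfamiliar with struct-valued test parameters, here is a minimal, self-contained sketch of that gtest pattern; the names are hypothetical stand-ins, not the patch's.

#include <gtest/gtest.h>
#include <vector>

struct DemoParams {
  int fileVersion;   // stand-in for orc::FileVersion
  bool alignBlocks;  // stand-in for enableAlignBlockBoundToRowGroup
};

class DemoWriterTest : public ::testing::TestWithParam<DemoParams> {};

TEST_P(DemoWriterTest, seesBothFields) {
  // gtest hands each element of the vector below to one run of this test.
  const DemoParams& p = GetParam();
  EXPECT_GE(p.fileVersion, 0);
  (void)p.alignBlocks;
}

static const std::vector<DemoParams> kDemoParams = {
    {11, true}, {11, false}, {12, true}, {12, false}};

INSTANTIATE_TEST_SUITE_P(Demo, DemoWriterTest, ::testing::ValuesIn(kDemoParams));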
+# Use the first Python installation on PATH, not the newest one +set(Python3_FIND_STRATEGY "LOCATION") +# On Windows, use registry last, not first +set(Python3_FIND_REGISTRY "LAST") +# On macOS, use framework last, not first +set(Python3_FIND_FRAMEWORK "LAST") + +find_package(Python3) +set(PYTHON_EXECUTABLE ${Python3_EXECUTABLE}) + +set(BUILD_SUPPORT_DIR "${PROJECT_SOURCE_DIR}/c++/build-support") + +find_program(CLANG_FORMAT_BIN + NAMES clang-format-13 + HINTS ${CLANG_SEARCH_PATH}) + +find_program(CLANG_TIDY_BIN + NAMES clang-tidy-13 + HINTS ${CLANG_SEARCH_PATH}) + +find_program(CLANG_APPLY_REPLACEMENTS_BIN + NAMES clang-apply-replacements-13 + HINTS ${CLANG_SEARCH_PATH}) + + +if("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND") + message(WARNING "Couldn't find clang-format.") +else() + message(STATUS "Found clang-format at ${CLANG_FORMAT_BIN}") +endif() + +if("${CLANG_TIDY_BIN}" STREQUAL "CLANG_TIDY_BIN-NOTFOUND") + message(WARNING "Couldn't find clang-tidy.") +else() + # Output compile_commands.json + set(CMAKE_EXPORT_COMPILE_COMMANDS 1) + message(STATUS "Found clang-tidy at ${CLANG_TIDY_BIN}") +endif() + +if("${CLANG_APPLY_REPLACEMENTS_BIN}" STREQUAL "CLANG_APPLY_REPLACEMENTS_BIN-NOTFOUND") + message(WARNING "Couldn't find clang-apply-replacements.") +else() + # Output compile_commands.json + set(CMAKE_EXPORT_COMPILE_COMMANDS 1) + message(STATUS "Found clang-apply-replacements at ${CLANG_APPLY_REPLACEMENTS_BIN}") +endif() + +if(NOT LINT_EXCLUSIONS_FILE) + # source files matching a glob from a line in this file + # will be excluded from linting (cpplint, clang-tidy, clang-format) + set(LINT_EXCLUSIONS_FILE ${BUILD_SUPPORT_DIR}/lint_exclusions.txt) +endif() + +# runs clang-tidy and exits with a non-zero exit code if any errors are found. 
+# note that clang-tidy automatically looks for a .clang-tidy file in parent directories +add_custom_target(check-clang-tidy + ${PYTHON_EXECUTABLE} + ${BUILD_SUPPORT_DIR}/run_clang_tidy.py # run LLVM's clang-tidy script + -clang-tidy-binary ${CLANG_TIDY_BIN} # using our clang-tidy binary + -p ${PROJECT_BINARY_DIR} # using cmake's generated compile commands +) + +add_custom_target(fix-clang-tidy + ${PYTHON_EXECUTABLE} + ${BUILD_SUPPORT_DIR}/run_clang_tidy.py # run LLVM's clang-tidy script + -clang-tidy-binary ${CLANG_TIDY_BIN} # using our clang-tidy binary + -p ${PROJECT_BINARY_DIR} # using cmake's generated compile commands + -clang-apply-replacements-binary ${CLANG_APPLY_REPLACEMENTS_BIN} # using our clang-apply-replacements binary + -fix # apply suggested changes generated by clang-tidy +) + +string(CONCAT ORC_FORMAT_DIRS + "${PROJECT_SOURCE_DIR}/c++," + "${PROJECT_SOURCE_DIR}/tools," +) + +add_custom_target(format + ${PYTHON_EXECUTABLE} + ${BUILD_SUPPORT_DIR}/run_clang_format.py + ${CLANG_FORMAT_BIN} + --source_dirs + ${ORC_FORMAT_DIRS} + --fix +) + +# Runs clang format and exits with a non-zero exit code if any files need to be reformatted +add_custom_target(check-format + ${PYTHON_EXECUTABLE} + ${BUILD_SUPPORT_DIR}/run_clang_format.py + ${CLANG_FORMAT_BIN} + --source_dirs + ${ORC_FORMAT_DIRS} +) \ No newline at end of file diff --git a/cmake_modules/FindLZ4.cmake b/cmake_modules/FindLZ4.cmake index b1557f496b..3b9cc7fbd1 100644 --- a/cmake_modules/FindLZ4.cmake +++ b/cmake_modules/FindLZ4.cmake @@ -22,6 +22,16 @@ # LZ4_STATIC_LIB: path to lz4.a # LZ4_FOUND: whether LZ4 has been found +if (NOT LZ4_HOME) + if (DEFINED ENV{LZ4_HOME}) + set (LZ4_HOME "$ENV{LZ4_HOME}") + elseif (LZ4_ROOT) + set (LZ4_HOME "${LZ4_ROOT}") + elseif (DEFINED ENV{LZ4_ROOT}) + set (LZ4_HOME "$ENV{LZ4_ROOT}") + endif () +endif () + if( NOT "${LZ4_HOME}" STREQUAL "") file (TO_CMAKE_PATH "${LZ4_HOME}" _lz4_path) endif() @@ -33,7 +43,7 @@ find_path (LZ4_INCLUDE_DIR lz4.h HINTS NO_DEFAULT_PATH PATH_SUFFIXES "include") -find_library (LZ4_LIBRARY NAMES lz4 HINTS +find_library (LZ4_LIBRARY NAMES lz4 liblz4 HINTS ${_lz4_path} PATH_SUFFIXES "lib" "lib64") @@ -74,3 +84,10 @@ mark_as_advanced ( LZ4_STATIC_LIB LZ4_LIBRARY ) + +if(LZ4_FOUND AND NOT TARGET LZ4::lz4) + add_library(LZ4::lz4 UNKNOWN IMPORTED) + set_target_properties(LZ4::lz4 + PROPERTIES IMPORTED_LOCATION "${LZ4_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${LZ4_INCLUDE_DIR}") +endif() diff --git a/cmake_modules/FindProtobuf.cmake b/cmake_modules/FindProtobuf.cmake index cca7c8b87e..ca91fb5ade 100644 --- a/cmake_modules/FindProtobuf.cmake +++ b/cmake_modules/FindProtobuf.cmake @@ -17,7 +17,7 @@ # PROTOBUF_HOME environmental variable is used to check for Protobuf headers and static library -# PROTOBUF_FOUND is set if Protobuf is found +# Protobuf_FOUND is set if Protobuf is found # PROTOBUF_INCLUDE_DIR: directory containing headers # PROTOBUF_LIBRARY: location of libprotobuf # PROTOBUF_STATIC_LIB: location of protobuf.a @@ -25,6 +25,19 @@ # PROTOC_STATIC_LIB: location of protoc.a # PROTOBUF_EXECUTABLE: location of protoc +if (NOT PROTOBUF_HOME) + if (DEFINED ENV{PROTOBUF_HOME}) + set (PROTOBUF_HOME "$ENV{PROTOBUF_HOME}") + elseif (Protobuf_ROOT) + set (PROTOBUF_HOME "${Protobuf_ROOT}") + elseif (DEFINED ENV{Protobuf_ROOT}) + set (PROTOBUF_HOME "$ENV{Protobuf_ROOT}") + elseif (PROTOBUF_ROOT) + set (PROTOBUF_HOME "${PROTOBUF_ROOT}") + elseif (DEFINED ENV{PROTOBUF_ROOT}) + set (PROTOBUF_HOME "$ENV{PROTOBUF_ROOT}") + endif () +endif () if( NOT "${PROTOBUF_HOME}" STREQUAL 
"") file (TO_CMAKE_PATH "${PROTOBUF_HOME}" _protobuf_path) @@ -32,8 +45,17 @@ endif() message (STATUS "PROTOBUF_HOME: ${PROTOBUF_HOME}") +if (NOT DEFINED CMAKE_STATIC_LIBRARY_SUFFIX) + if (WIN32) + set (CMAKE_STATIC_LIBRARY_SUFFIX ".lib") + else () + set (CMAKE_STATIC_LIBRARY_SUFFIX ".a") + endif () +endif () + find_package (Protobuf CONFIG) if (Protobuf_FOUND) + if (TARGET protobuf::libprotobuf) set (PROTOBUF_LIBRARY protobuf::libprotobuf) set (PROTOBUF_STATIC_LIB PROTOBUF_STATIC_LIB-NOTFOUND) set (PROTOC_LIBRARY protobuf::libprotoc) @@ -42,15 +64,34 @@ if (Protobuf_FOUND) get_target_property (target_type protobuf::libprotobuf TYPE) if (target_type STREQUAL "STATIC_LIBRARY") - set(PROTOBUF_STATIC_LIB protobuf::libprotobuf) + set (PROTOBUF_STATIC_LIB protobuf::libprotobuf) endif () get_target_property (target_type protobuf::libprotoc TYPE) if (target_type STREQUAL "STATIC_LIBRARY") - set (PROTOC_STATIC_LIB protobuf::libprotoc) + set (PROTOC_STATIC_LIB protobuf::libprotoc) endif () - get_target_property (PROTOBUF_INCLUDE_DIR protobuf::libprotoc INTERFACE_INCLUDE_DIRECTORIES) + get_target_property (PROTOBUF_INCLUDE_DIR protobuf::libprotobuf INTERFACE_INCLUDE_DIRECTORIES) + if (NOT PROTOBUF_INCLUDE_DIR) + set (PROTOBUF_INCLUDE_DIR ${Protobuf_INCLUDE_DIRS}) + if (NOT PROTOBUF_INCLUDE_DIR) + message (FATAL_ERROR "Cannot determine Protobuf include directory from protobuf::libprotobuf and Protobuf_INCLUDE_DIRS.") + endif () + endif () + else () + set (PROTOBUF_LIBRARY ${Protobuf_LIBRARIES}) + set (PROTOBUF_INCLUDE_DIR ${Protobuf_INCLUDE_DIRS}) + if (NOT PROTOBUF_INCLUDE_DIR) + message (FATAL_ERROR "Cannot determine Protobuf include directory.") + endif () + + if (Protobuf_LIBRARIES MATCHES "\\${CMAKE_STATIC_LIBRARY_SUFFIX}$") + set (PROTOBUF_STATIC_LIB ${Protobuf_LIBRARIES}) + else () + set (PROTOBUF_STATIC_LIB PROTOBUF_STATIC_LIB-NOTFOUND) + endif () + endif () else() find_path (PROTOBUF_INCLUDE_DIR google/protobuf/io/zero_copy_stream.h HINTS @@ -63,7 +104,7 @@ else() NO_DEFAULT_PATH PATH_SUFFIXES "include") - find_library (PROTOBUF_LIBRARY NAMES protobuf HINTS + find_library (PROTOBUF_LIBRARY NAMES protobuf libprotobuf HINTS ${_protobuf_path} PATH_SUFFIXES "lib") @@ -71,7 +112,7 @@ else() ${_protobuf_path} PATH_SUFFIXES "lib") - find_library (PROTOC_LIBRARY NAMES protoc HINTS + find_library (PROTOC_LIBRARY NAMES protoc libprotoc HINTS ${_protobuf_path} PATH_SUFFIXES "lib") @@ -86,14 +127,14 @@ else() endif () if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOC_LIBRARY AND PROTOBUF_EXECUTABLE) - set (PROTOBUF_FOUND TRUE) + set (Protobuf_FOUND TRUE) set (PROTOBUF_LIB_NAME protobuf) set (PROTOC_LIB_NAME protoc) else () - set (PROTOBUF_FOUND FALSE) + set (Protobuf_FOUND FALSE) endif () -if (PROTOBUF_FOUND) +if (Protobuf_FOUND) message (STATUS "Found the Protobuf headers: ${PROTOBUF_INCLUDE_DIR}") message (STATUS "Found the Protobuf library: ${PROTOBUF_LIBRARY}") message (STATUS "Found the Protoc library: ${PROTOC_LIBRARY}") @@ -125,3 +166,10 @@ mark_as_advanced ( PROTOC_STATIC_LIB PROTOC_LIBRARY ) + +if(Protobuf_FOUND AND NOT TARGET protobuf::libprotobuf) + add_library(protobuf::libprotobuf UNKNOWN IMPORTED) + set_target_properties(protobuf::libprotobuf + PROPERTIES IMPORTED_LOCATION "${PROTOBUF_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${PROTOBUF_INCLUDE_DIR}") +endif() diff --git a/cmake_modules/FindSnappy.cmake b/cmake_modules/FindSnappy.cmake index f0a0773801..1ad9914542 100644 --- a/cmake_modules/FindSnappy.cmake +++ b/cmake_modules/FindSnappy.cmake @@ -20,7 +20,21 @@ # 
SNAPPY_INCLUDE_DIR: directory containing headers # SNAPPY_LIBRARY: path to libsnappy # SNAPPY_STATIC_LIB: path to libsnappy.a -# SNAPPY_FOUND: whether snappy has been found +# Snappy_FOUND: whether snappy has been found + +if (NOT SNAPPY_HOME) + if (DEFINED ENV{SNAPPY_HOME}) + set (SNAPPY_HOME "$ENV{SNAPPY_HOME}") + elseif (Snappy_ROOT) + set (SNAPPY_HOME "${Snappy_ROOT}") + elseif (DEFINED ENV{Snappy_ROOT}) + set (SNAPPY_HOME "$ENV{Snappy_ROOT}") + elseif (SNAPPY_ROOT) + set (SNAPPY_HOME "${SNAPPY_ROOT}") + elseif (DEFINED ENV{SNAPPY_ROOT}) + set (SNAPPY_HOME "$ENV{SNAPPY_ROOT}") + endif () +endif () if( NOT "${SNAPPY_HOME}" STREQUAL "") file (TO_CMAKE_PATH "${SNAPPY_HOME}" _snappy_path) @@ -42,14 +56,14 @@ find_library (SNAPPY_STATIC_LIB NAMES ${CMAKE_STATIC_LIBRARY_PREFIX}${SNAPPY_LIB PATH_SUFFIXES "lib" "lib64") if (SNAPPY_INCLUDE_DIR AND SNAPPY_LIBRARY) - set (SNAPPY_FOUND TRUE) + set (Snappy_FOUND TRUE) set (SNAPPY_HEADER_NAME snappy.h) set (SNAPPY_HEADER ${SNAPPY_INCLUDE_DIR}/${SNAPPY_HEADER_NAME}) else () - set (SNAPPY_FOUND FALSE) + set (Snappy_FOUND FALSE) endif () -if (SNAPPY_FOUND) +if (Snappy_FOUND) message (STATUS "Found the Snappy header: ${SNAPPY_HEADER}") message (STATUS "Found the Snappy library: ${SNAPPY_LIBRARY}") if (SNAPPY_STATIC_LIB) @@ -74,3 +88,10 @@ mark_as_advanced ( SNAPPY_STATIC_LIB SNAPPY_LIBRARY ) + +if(Snappy_FOUND AND NOT TARGET Snappy::snappy) + add_library(Snappy::snappy UNKNOWN IMPORTED) + set_target_properties(Snappy::snappy + PROPERTIES IMPORTED_LOCATION "${SNAPPY_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${SNAPPY_INCLUDE_DIR}") +endif() diff --git a/cmake_modules/FindZLIB.cmake b/cmake_modules/FindZLIB.cmake index 2f83a974cd..374814a7f9 100644 --- a/cmake_modules/FindZLIB.cmake +++ b/cmake_modules/FindZLIB.cmake @@ -22,6 +22,16 @@ # ZLIB_STATIC_LIB: path to zlib.a # ZLIB_FOUND: whether ZLIB has been found +if (NOT ZLIB_HOME) + if (DEFINED ENV{ZLIB_HOME}) + set (ZLIB_HOME "$ENV{ZLIB_HOME}") + elseif (ZLIB_ROOT) + set (ZLIB_HOME "${ZLIB_ROOT}") + elseif (DEFINED ENV{ZLIB_ROOT}) + set (ZLIB_HOME "$ENV{ZLIB_ROOT}") + endif () +endif () + if( NOT "${ZLIB_HOME}" STREQUAL "") file (TO_CMAKE_PATH "${ZLIB_HOME}" _zlib_path) endif() @@ -78,3 +88,10 @@ mark_as_advanced ( ZLIB_STATIC_LIB ZLIB_LIBRARY ) + +if(ZLIB_FOUND AND NOT TARGET ZLIB::ZLIB) + add_library(ZLIB::ZLIB UNKNOWN IMPORTED) + set_target_properties(ZLIB::ZLIB + PROPERTIES IMPORTED_LOCATION "${ZLIB_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${ZLIB_INCLUDE_DIR}") +endif() diff --git a/cmake_modules/FindZSTD.cmake b/cmake_modules/FindZSTD.cmake index 7ec197221d..581719453c 100644 --- a/cmake_modules/FindZSTD.cmake +++ b/cmake_modules/FindZSTD.cmake @@ -22,6 +22,16 @@ # ZSTD_STATIC_LIB: path to libzstd.a # ZSTD_FOUND: whether zstd has been found +if (NOT ZSTD_HOME) + if (DEFINED ENV{ZSTD_HOME}) + set (ZSTD_HOME "$ENV{ZSTD_HOME}") + elseif (ZSTD_ROOT) + set (ZSTD_HOME "${ZSTD_ROOT}") + elseif (DEFINED ENV{ZSTD_ROOT}) + set (ZSTD_HOME "$ENV{ZSTD_ROOT}") + endif () +endif () + if( NOT "${ZSTD_HOME}" STREQUAL "") file (TO_CMAKE_PATH "${ZSTD_HOME}" _zstd_path) endif() @@ -74,3 +84,18 @@ mark_as_advanced ( ZSTD_STATIC_LIB ZSTD_LIBRARY ) + +if(ZSTD_FOUND) + if(NOT TARGET zstd::libzstd_static AND ZSTD_STATIC_LIB) + add_library(zstd::libzstd_static STATIC IMPORTED) + set_target_properties(zstd::libzstd_static + PROPERTIES IMPORTED_LOCATION "${ZSTD_STATIC_LIB}" + INTERFACE_INCLUDE_DIRECTORIES "${ZSTD_INCLUDE_DIR}") + endif() + if(NOT TARGET zstd::libzstd_shared AND NOT ZSTD_STATIC_LIB) + 
add_library(zstd::libzstd_shared SHARED IMPORTED) + set_target_properties(zstd::libzstd_shared + PROPERTIES IMPORTED_LOCATION "${ZSTD_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${ZSTD_INCLUDE_DIR}") + endif() +endif() diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake index ec33193d79..ce1f6c4a9e 100644 --- a/cmake_modules/ThirdpartyToolchain.cmake +++ b/cmake_modules/ThirdpartyToolchain.cmake @@ -15,10 +15,14 @@ # specific language governing permissions and limitations # under the License. -set(ORC_FORMAT_VERSION "1.0.0") -set(LZ4_VERSION "1.9.3") -set(SNAPPY_VERSION "1.1.7") -set(ZLIB_VERSION "1.2.11") +set(ORC_VENDOR_DEPENDENCIES) +set(ORC_SYSTEM_DEPENDENCIES) +set(ORC_INSTALL_INTERFACE_TARGETS) + +set(ORC_FORMAT_VERSION "1.1.0") +set(LZ4_VERSION "1.10.0") +set(SNAPPY_VERSION "1.2.1") +set(ZLIB_VERSION "1.3.1") set(GTEST_VERSION "1.12.1") set(PROTOBUF_VERSION "3.5.1") set(ZSTD_VERSION "1.5.5") @@ -33,7 +37,7 @@ option(ORC_PREFER_STATIC_GMOCK "Prefer static gmock library, if available" # zstd requires us to add the threads FIND_PACKAGE(Threads REQUIRED) -set(THIRDPARTY_DIR "${CMAKE_BINARY_DIR}/c++/libs/thirdparty") +set(THIRDPARTY_DIR "${PROJECT_BINARY_DIR}/c++/libs/thirdparty") set(THIRDPARTY_LOG_OPTIONS LOG_CONFIGURE 1 LOG_BUILD 1 LOG_INSTALL 1 @@ -47,34 +51,101 @@ string(TOUPPER ${CMAKE_BUILD_TYPE} UPPERCASE_BUILD_TYPE) if (DEFINED ENV{SNAPPY_HOME}) set (SNAPPY_HOME "$ENV{SNAPPY_HOME}") +elseif (Snappy_ROOT) + set (SNAPPY_HOME "${Snappy_ROOT}") +elseif (DEFINED ENV{Snappy_ROOT}) + set (SNAPPY_HOME "$ENV{Snappy_ROOT}") +elseif (SNAPPY_ROOT) + set (SNAPPY_HOME "${SNAPPY_ROOT}") +elseif (DEFINED ENV{SNAPPY_ROOT}) + set (SNAPPY_HOME "$ENV{SNAPPY_ROOT}") endif () if (DEFINED ENV{ZLIB_HOME}) set (ZLIB_HOME "$ENV{ZLIB_HOME}") +elseif (ZLIB_ROOT) + set (ZLIB_HOME "${ZLIB_ROOT}") +elseif (DEFINED ENV{ZLIB_ROOT}) + set (ZLIB_HOME "$ENV{ZLIB_ROOT}") endif () if (DEFINED ENV{LZ4_HOME}) set (LZ4_HOME "$ENV{LZ4_HOME}") +elseif (LZ4_ROOT) + set (LZ4_HOME "${LZ4_ROOT}") +elseif (DEFINED ENV{LZ4_ROOT}) + set (LZ4_HOME "$ENV{LZ4_ROOT}") endif () if (DEFINED ENV{PROTOBUF_HOME}) set (PROTOBUF_HOME "$ENV{PROTOBUF_HOME}") +elseif (Protobuf_ROOT) + set (PROTOBUF_HOME "${Protobuf_ROOT}") +elseif (DEFINED ENV{Protobuf_ROOT}) + set (PROTOBUF_HOME "$ENV{Protobuf_ROOT}") +elseif (PROTOBUF_ROOT) + set (PROTOBUF_HOME "${PROTOBUF_ROOT}") +elseif (DEFINED ENV{PROTOBUF_ROOT}) + set (PROTOBUF_HOME "$ENV{PROTOBUF_ROOT}") endif () if (DEFINED ENV{ZSTD_HOME}) set (ZSTD_HOME "$ENV{ZSTD_HOME}") +elseif (ZSTD_ROOT) + set (ZSTD_HOME "${ZSTD_ROOT}") +elseif (DEFINED ENV{ZSTD_ROOT}) + set (ZSTD_HOME "$ENV{ZSTD_ROOT}") endif () if (DEFINED ENV{GTEST_HOME}) set (GTEST_HOME "$ENV{GTEST_HOME}") endif () +# ---------------------------------------------------------------------- +# Macros for adding third-party libraries +macro (orc_add_resolved_library target_name link_lib include_dir) + add_library (${target_name} INTERFACE IMPORTED GLOBAL) + target_link_libraries (${target_name} INTERFACE ${link_lib}) + target_include_directories (${target_name} SYSTEM INTERFACE ${include_dir}) +endmacro () + +macro (orc_add_built_library external_project_name target_name link_lib include_dir) + file (MAKE_DIRECTORY "${include_dir}") + + add_library (${target_name} STATIC IMPORTED) + set_target_properties (${target_name} PROPERTIES IMPORTED_LOCATION "${link_lib}") + target_include_directories (${target_name} BEFORE INTERFACE "${include_dir}") + + add_dependencies (${target_name} ${external_project_name}) 
+ if (INSTALL_VENDORED_LIBS) + install (FILES "${link_lib}" DESTINATION "${CMAKE_INSTALL_LIBDIR}") + endif () +endmacro () + +function(orc_provide_cmake_module MODULE_NAME) + set(module "${PROJECT_SOURCE_DIR}/cmake_modules/${MODULE_NAME}.cmake") + if(EXISTS "${module}") + message(STATUS "Providing CMake module for ${MODULE_NAME} as part of CMake package") + install(FILES "${module}" DESTINATION "${ORC_INSTALL_CMAKE_DIR}") + endif() +endfunction() + +function(orc_provide_find_module PACKAGE_NAME) + orc_provide_cmake_module("Find${PACKAGE_NAME}") +endfunction() + # ---------------------------------------------------------------------- # ORC Format +if(DEFINED ENV{ORC_FORMAT_URL}) + set(ORC_FORMAT_SOURCE_URL "$ENV{ORC_FORMAT_URL}") + message(STATUS "Using ORC_FORMAT_URL: ${ORC_FORMAT_SOURCE_URL}") +else() + set(ORC_FORMAT_SOURCE_URL "https://www.apache.org/dyn/closer.lua/orc/orc-format-$%7BORC_FORMAT_VERSION%7D/orc-format-$%7BORC_FORMAT_VERSION%7D.tar.gz?action=download" ) + message(STATUS "Using DEFAULT URL: ${ORC_FORMAT_SOURCE_URL}") +endif() ExternalProject_Add (orc-format_ep - URL "https://dlcdn.apache.org/orc/orc-format-$%7BORC_FORMAT_VERSION%7D/orc-format-$%7BORC_FORMAT_VERSION%7D.tar.gz" - URL "https://archive.apache.org/dist/orc/orc-format-$%7BORC_FORMAT_VERSION%7D/orc-format-$%7BORC_FORMAT_VERSION%7D.tar.gz" - URL_HASH SHA256=739fae5ff94b1f812b413077280361045bf92e510ef04b34a610e23a945d8cd5 + URL ${ORC_FORMAT_SOURCE_URL} + URL_HASH SHA256=d4a7ac76c5442abf7119e2cb84e71b677e075aff53518aa866055e2ead0450d7 CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" @@ -83,16 +154,36 @@ ExternalProject_Add (orc-format_ep # ---------------------------------------------------------------------- # Snappy - -if (NOT "${SNAPPY_HOME}" STREQUAL "" OR ORC_PACKAGE_KIND STREQUAL "conan") +if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (Snappy REQUIRED CONFIG) + add_library (orc_snappy INTERFACE) + target_link_libraries(orc_snappy INTERFACE Snappy::snappy) + list (APPEND ORC_SYSTEM_DEPENDENCIES Snappy) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + find_package(Snappy CONFIG REQUIRED) + add_library (orc_snappy INTERFACE IMPORTED) + target_link_libraries(orc_snappy INTERFACE Snappy::snappy) + list (APPEND ORC_SYSTEM_DEPENDENCIES Snappy) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +elseif (NOT "${SNAPPY_HOME}" STREQUAL "") find_package (Snappy REQUIRED) - set(SNAPPY_VENDORED FALSE) + if (ORC_PREFER_STATIC_SNAPPY AND SNAPPY_STATIC_LIB) + orc_add_resolved_library (orc_snappy ${SNAPPY_STATIC_LIB} ${SNAPPY_INCLUDE_DIR}) + else () + orc_add_resolved_library (orc_snappy ${SNAPPY_LIBRARY} ${SNAPPY_INCLUDE_DIR}) + endif () + list (APPEND ORC_SYSTEM_DEPENDENCIES Snappy) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + orc_provide_find_module (Snappy) else () set(SNAPPY_HOME "${THIRDPARTY_DIR}/snappy_ep-install") set(SNAPPY_INCLUDE_DIR "${SNAPPY_HOME}/include") - set(SNAPPY_STATIC_LIB "${SNAPPY_HOME}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}snappy${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(SNAPPY_STATIC_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}snappy${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(SNAPPY_STATIC_LIB "${SNAPPY_HOME}/lib/${SNAPPY_STATIC_LIB_NAME}") set(SNAPPY_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${SNAPPY_HOME} - -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_LIBDIR=lib) + -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_LIBDIR=lib + -DSNAPPY_BUILD_BENCHMARKS=OFF) if (BUILD_POSITION_INDEPENDENT_LIB) set(SNAPPY_CMAKE_ARGS ${SNAPPY_CMAKE_ARGS} 
-DCMAKE_POSITION_INDEPENDENT_CODE=ON) @@ -104,39 +195,39 @@ else () ${THIRDPARTY_LOG_OPTIONS} BUILD_BYPRODUCTS "${SNAPPY_STATIC_LIB}") - set(SNAPPY_LIBRARY ${SNAPPY_STATIC_LIB}) - set(SNAPPY_VENDORED TRUE) -endif () + orc_add_built_library (snappy_ep orc_snappy ${SNAPPY_STATIC_LIB} ${SNAPPY_INCLUDE_DIR}) -add_library (orc_snappy INTERFACE) -add_library (orc::snappy ALIAS orc_snappy) -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_link_libraries(orc_snappy INTERFACE ${Snappy_LIBRARIES}) -elseif (ORC_PREFER_STATIC_SNAPPY AND ${SNAPPY_STATIC_LIB}) - target_link_libraries(orc_snappy INTERFACE ${SNAPPY_STATIC_LIB}) -else () - target_link_libraries(orc_snappy INTERFACE ${SNAPPY_LIBRARY}) -endif () -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_include_directories (orc_snappy SYSTEM INTERFACE ${Snappy_INCLUDE_DIR}) -else() - target_include_directories (orc_snappy SYSTEM INTERFACE ${SNAPPY_INCLUDE_DIR}) + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_snappy|${SNAPPY_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") endif () -if (SNAPPY_VENDORED) - add_dependencies (orc_snappy snappy_ep) - if (INSTALL_VENDORED_LIBS) - install(FILES "${SNAPPY_STATIC_LIB}" - DESTINATION "lib") - endif () -endif () +add_library (orc::snappy ALIAS orc_snappy) # ---------------------------------------------------------------------- # ZLIB -if (NOT "${ZLIB_HOME}" STREQUAL "" OR ORC_PACKAGE_KIND STREQUAL "conan") +if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (ZLIB REQUIRED CONFIG) + add_library (orc_zlib INTERFACE) + target_link_libraries(orc_zlib INTERFACE ZLIB::ZLIB) + list (APPEND ORC_SYSTEM_DEPENDENCIES ZLIB) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + find_package(ZLIB REQUIRED) + add_library (orc_zlib INTERFACE IMPORTED) + target_link_libraries(orc_zlib INTERFACE ZLIB::ZLIB) + list (APPEND ORC_SYSTEM_DEPENDENCIES ZLIB) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +elseif (NOT "${ZLIB_HOME}" STREQUAL "") find_package (ZLIB REQUIRED) - set(ZLIB_VENDORED FALSE) + if (ORC_PREFER_STATIC_ZLIB AND ZLIB_STATIC_LIB) + orc_add_resolved_library (orc_zlib ${ZLIB_STATIC_LIB} ${ZLIB_INCLUDE_DIR}) + else () + orc_add_resolved_library (orc_zlib ${ZLIB_LIBRARY} ${ZLIB_INCLUDE_DIR}) + endif () + list (APPEND ORC_SYSTEM_DEPENDENCIES ZLIB) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + orc_provide_find_module (ZLIB) else () set(ZLIB_PREFIX "${THIRDPARTY_DIR}/zlib_ep-install") set(ZLIB_INCLUDE_DIR "${ZLIB_PREFIX}/include") @@ -148,7 +239,8 @@ else () else () set(ZLIB_STATIC_LIB_NAME z) endif () - set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}${ZLIB_STATIC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(ZLIB_STATIC_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${ZLIB_STATIC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${ZLIB_STATIC_LIB_NAME}") set(ZLIB_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX} -DBUILD_SHARED_LIBS=OFF) @@ -162,35 +254,43 @@ else () ${THIRDPARTY_LOG_OPTIONS} BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}") - set(ZLIB_LIBRARY ${ZLIB_STATIC_LIB}) - set(ZLIB_VENDORED TRUE) -endif () + orc_add_built_library (zlib_ep orc_zlib ${ZLIB_STATIC_LIB} ${ZLIB_INCLUDE_DIR}) -add_library (orc_zlib INTERFACE) -add_library (orc::zlib ALIAS orc_zlib) -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_link_libraries (orc_zlib INTERFACE ${ZLIB_LIBRARIES}) -elseif (ORC_PREFER_STATIC_ZLIB AND ${ZLIB_STATIC_LIB}) - target_link_libraries (orc_zlib INTERFACE ${ZLIB_STATIC_LIB}) -else () - 
target_link_libraries (orc_zlib INTERFACE ${ZLIB_LIBRARY}) + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_zlib|${ZLIB_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") endif () -target_include_directories (orc_zlib SYSTEM INTERFACE ${ZLIB_INCLUDE_DIR}) -if (ZLIB_VENDORED) - add_dependencies (orc_zlib zlib_ep) - if (INSTALL_VENDORED_LIBS) - install(FILES "${ZLIB_STATIC_LIB}" - DESTINATION "lib") - endif () -endif () +add_library (orc::zlib ALIAS orc_zlib) # ---------------------------------------------------------------------- # Zstd -if (NOT "${ZSTD_HOME}" STREQUAL "" OR ORC_PACKAGE_KIND STREQUAL "conan") +if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (ZSTD REQUIRED CONFIG) + add_library (orc_zstd INTERFACE) + target_link_libraries (orc_zstd INTERFACE + $ + $ + ) + list (APPEND ORC_SYSTEM_DEPENDENCIES ZSTD) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$,zstd::libzstd_shared,zstd::libzstd_static>>") +elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + find_package(zstd CONFIG REQUIRED) + add_library(orc_zstd INTERFACE) + target_link_libraries(orc_zstd INTERFACE $,zstd::libzstd_shared,zstd::libzstd_static>) + list(APPEND ORC_SYSTEM_DEPENDENCIES zstd) + list(APPEND ORC_INSTALL_INTERFACE_TARGETS "$,zstd::libzstd_shared,zstd::libzstd_static>>") +elseif (NOT "${ZSTD_HOME}" STREQUAL "") find_package (ZSTD REQUIRED) - set(ZSTD_VENDORED FALSE) + if (ORC_PREFER_STATIC_ZSTD AND ZSTD_STATIC_LIB) + orc_add_resolved_library (orc_zstd ${ZSTD_STATIC_LIB} ${ZSTD_INCLUDE_DIR}) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + else () + orc_add_resolved_library (orc_zstd ${ZSTD_LIBRARY} ${ZSTD_INCLUDE_DIR}) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$,zstd::libzstd_shared,zstd::libzstd_static>>") + endif () + list (APPEND ORC_SYSTEM_DEPENDENCIES ZSTD) + orc_provide_find_module (ZSTD) else () set(ZSTD_HOME "${THIRDPARTY_DIR}/zstd_ep-install") set(ZSTD_INCLUDE_DIR "${ZSTD_HOME}/include") @@ -202,7 +302,8 @@ else () else () set(ZSTD_STATIC_LIB_NAME zstd) endif () - set(ZSTD_STATIC_LIB "${ZSTD_HOME}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}${ZSTD_STATIC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(ZSTD_STATIC_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${ZSTD_STATIC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(ZSTD_STATIC_LIB "${ZSTD_HOME}/lib/${ZSTD_STATIC_LIB_NAME}") set(ZSTD_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZSTD_HOME} -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_LIBDIR=lib) @@ -223,43 +324,46 @@ else () ${THIRDPARTY_LOG_OPTIONS} BUILD_BYPRODUCTS ${ZSTD_STATIC_LIB}) - set(ZSTD_LIBRARY ${ZSTD_STATIC_LIB}) - set(ZSTD_VENDORED TRUE) -endif () + orc_add_built_library (zstd_ep orc_zstd ${ZSTD_STATIC_LIB} ${ZSTD_INCLUDE_DIR}) -add_library (orc_zstd INTERFACE) -add_library (orc::zstd ALIAS orc_zstd) -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_link_libraries (orc_zstd INTERFACE ${zstd_LIBRARIES}) -elseif (ORC_PREFER_STATIC_ZSTD AND ${ZSTD_STATIC_LIB}) - target_link_libraries (orc_zstd INTERFACE ${ZSTD_STATIC_LIB}) -else () - target_link_libraries (orc_zstd INTERFACE ${ZSTD_LIBRARY}) -endif () -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_include_directories (orc_zstd SYSTEM INTERFACE ${zstd_INCLUDE_DIR}) -else() - target_include_directories (orc_zstd SYSTEM INTERFACE ${ZSTD_INCLUDE_DIR}) + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_zstd|${ZSTD_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") endif () -if (ZSTD_VENDORED) - add_dependencies (orc_zstd zstd_ep) - if (INSTALL_VENDORED_LIBS) - install(FILES "${ZSTD_STATIC_LIB}" - DESTINATION "lib") - endif () 
-endif () +add_library (orc::zstd ALIAS orc_zstd) # ---------------------------------------------------------------------- # LZ4 - -if (NOT "${LZ4_HOME}" STREQUAL "" OR ORC_PACKAGE_KIND STREQUAL "conan") +if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (LZ4 REQUIRED CONFIG) + add_library (orc_lz4 INTERFACE) + target_link_libraries (orc_lz4 INTERFACE + $ + $ + ) + list (APPEND ORC_SYSTEM_DEPENDENCIES LZ4) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$,LZ4::lz4_shared,LZ4::lz4_static>>") +elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + find_package(lz4 CONFIG REQUIRED) + add_library (orc_lz4 INTERFACE IMPORTED) + target_link_libraries(orc_lz4 INTERFACE LZ4::lz4) + list (APPEND ORC_SYSTEM_DEPENDENCIES lz4) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +elseif (NOT "${LZ4_HOME}" STREQUAL "") find_package (LZ4 REQUIRED) - set(LZ4_VENDORED FALSE) + if (ORC_PREFER_STATIC_LZ4 AND LZ4_STATIC_LIB) + orc_add_resolved_library (orc_lz4 ${LZ4_STATIC_LIB} ${LZ4_INCLUDE_DIR}) + else () + orc_add_resolved_library (orc_lz4 ${LZ4_LIBRARY} ${LZ4_INCLUDE_DIR}) + endif () + list (APPEND ORC_SYSTEM_DEPENDENCIES LZ4) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + orc_provide_find_module (LZ4) else () set(LZ4_PREFIX "${THIRDPARTY_DIR}/lz4_ep-install") set(LZ4_INCLUDE_DIR "${LZ4_PREFIX}/include") - set(LZ4_STATIC_LIB "${LZ4_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}lz4${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(LZ4_STATIC_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}lz4${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(LZ4_STATIC_LIB "${LZ4_PREFIX}/lib/${LZ4_STATIC_LIB_NAME}") set(LZ4_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LZ4_PREFIX} -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=OFF) @@ -281,32 +385,13 @@ else () ${THIRDPARTY_LOG_OPTIONS} BUILD_BYPRODUCTS ${LZ4_STATIC_LIB}) - set(LZ4_LIBRARY ${LZ4_STATIC_LIB}) - set(LZ4_VENDORED TRUE) -endif () + orc_add_built_library (lz4_ep orc_lz4 ${LZ4_STATIC_LIB} ${LZ4_INCLUDE_DIR}) -add_library (orc_lz4 INTERFACE) -add_library (orc::lz4 ALIAS orc_lz4) -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_link_libraries (orc_lz4 INTERFACE ${lz4_LIBRARIES}) -elseif (ORC_PREFER_STATIC_LZ4 AND ${LZ4_STATIC_LIB}) - target_link_libraries (orc_lz4 INTERFACE ${LZ4_STATIC_LIB}) -else () - target_link_libraries (orc_lz4 INTERFACE ${LZ4_LIBRARY}) -endif () -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_include_directories (orc_lz4 SYSTEM INTERFACE ${lz4_INCLUDE_DIR}) -else() - target_include_directories (orc_lz4 SYSTEM INTERFACE ${LZ4_INCLUDE_DIR}) + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_lz4|${LZ4_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") endif () -if (LZ4_VENDORED) - add_dependencies (orc_lz4 lz4_ep) - if (INSTALL_VENDORED_LIBS) - install(FILES "${LZ4_STATIC_LIB}" - DESTINATION "lib") - endif () -endif () +add_library (orc::lz4 ALIAS orc_lz4) # ---------------------------------------------------------------------- # IANA - Time Zone Database @@ -390,7 +475,7 @@ if (BUILD_CPP_TESTS) add_library (orc::gmock ALIAS orc_gmock) add_library (orc_gtest INTERFACE) add_library (orc::gtest ALIAS orc_gtest) - if (ORC_PREFER_STATIC_GMOCK AND ${GMOCK_STATIC_LIB}) + if (ORC_PREFER_STATIC_GMOCK AND GMOCK_STATIC_LIB) target_link_libraries (orc_gmock INTERFACE ${GMOCK_STATIC_LIB}) target_link_libraries (orc_gtest INTERFACE ${GTEST_STATIC_LIB}) else () @@ -414,9 +499,37 @@ endif () # ---------------------------------------------------------------------- # Protobuf -if (NOT "${PROTOBUF_HOME}" STREQUAL "" OR ORC_PACKAGE_KIND STREQUAL "conan") +if (ORC_PACKAGE_KIND STREQUAL 
"conan") + find_package (Protobuf REQUIRED CONFIG) + add_library (orc_protobuf INTERFACE) + target_link_libraries(orc_protobuf INTERFACE protobuf::protobuf) + list (APPEND ORC_SYSTEM_DEPENDENCIES Protobuf) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + find_package(Protobuf CONFIG REQUIRED) + add_library (orc_protobuf INTERFACE IMPORTED) + target_link_libraries(orc_protobuf INTERFACE protobuf::libprotobuf) + list (APPEND ORC_SYSTEM_DEPENDENCIES Protobuf) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + set (PROTOBUF_EXECUTABLE protobuf::protoc) +elseif (NOT "${PROTOBUF_HOME}" STREQUAL "") find_package (Protobuf REQUIRED) - set(PROTOBUF_VENDORED FALSE) + + if (ORC_PREFER_STATIC_PROTOBUF AND PROTOBUF_STATIC_LIB) + orc_add_resolved_library (orc_protobuf ${PROTOBUF_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) + else () + orc_add_resolved_library (orc_protobuf ${PROTOBUF_LIBRARY} ${PROTOBUF_INCLUDE_DIR}) + endif () + + if (ORC_PREFER_STATIC_PROTOBUF AND PROTOC_STATIC_LIB) + orc_add_resolved_library (orc_protoc ${PROTOC_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) + else () + orc_add_resolved_library (orc_protoc ${PROTOC_LIBRARY} ${PROTOBUF_INCLUDE_DIR}) + endif () + + list (APPEND ORC_SYSTEM_DEPENDENCIES Protobuf) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + orc_provide_find_module (Protobuf) else () set(PROTOBUF_PREFIX "${THIRDPARTY_DIR}/protobuf_ep-install") set(PROTOBUF_INCLUDE_DIR "${PROTOBUF_PREFIX}/include") @@ -436,7 +549,8 @@ else () else () set(PROTOBUF_STATIC_LIB_PREFIX ${CMAKE_STATIC_LIBRARY_PREFIX}) endif () - set(PROTOBUF_STATIC_LIB "${PROTOBUF_PREFIX}/lib/${PROTOBUF_STATIC_LIB_PREFIX}protobuf${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(PROTOBUF_STATIC_LIB_NAME "${PROTOBUF_STATIC_LIB_PREFIX}protobuf${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(PROTOBUF_STATIC_LIB "${PROTOBUF_PREFIX}/lib/${PROTOBUF_STATIC_LIB_NAME}") set(PROTOC_STATIC_LIB "${PROTOBUF_PREFIX}/lib/${PROTOBUF_STATIC_LIB_PREFIX}protoc${CMAKE_STATIC_LIBRARY_SUFFIX}") set(PROTOBUF_EXECUTABLE "${PROTOBUF_PREFIX}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}") @@ -453,45 +567,16 @@ else () ${THIRDPARTY_LOG_OPTIONS} BUILD_BYPRODUCTS "${PROTOBUF_STATIC_LIB}" "${PROTOC_STATIC_LIB}") - set(PROTOBUF_LIBRARY ${PROTOBUF_STATIC_LIB}) - set(PROTOC_LIBRARY ${PROTOC_STATIC_LIB}) - set(PROTOBUF_VENDORED TRUE) -endif () - -add_library (orc_protobuf INTERFACE) -add_library (orc::protobuf ALIAS orc_protobuf) -add_library (orc_protoc INTERFACE) -add_library (orc::protoc ALIAS orc_protoc) + orc_add_built_library (protobuf_ep orc_protobuf ${PROTOBUF_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) + orc_add_built_library (protobuf_ep orc_protoc ${PROTOC_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_link_libraries (orc_protobuf INTERFACE ${protobuf_LIBRARIES}) -elseif (ORC_PREFER_STATIC_PROTOBUF AND ${PROTOBUF_STATIC_LIB}) - target_link_libraries (orc_protobuf INTERFACE ${PROTOBUF_STATIC_LIB}) -else () - target_link_libraries (orc_protobuf INTERFACE ${PROTOBUF_LIBRARY}) -endif() -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_include_directories (orc_protobuf SYSTEM INTERFACE ${protobuf_INCLUDE_DIR}) -else () - target_include_directories (orc_protobuf SYSTEM INTERFACE ${PROTOBUF_INCLUDE_DIR}) + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_protobuf|${PROTOBUF_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") endif () -if (NOT ORC_PACKAGE_KIND STREQUAL "conan") - if (ORC_PREFER_STATIC_PROTOBUF AND ${PROTOC_STATIC_LIB}) - target_link_libraries (orc_protoc INTERFACE 
${PROTOC_STATIC_LIB}) - else () - target_link_libraries (orc_protoc INTERFACE ${PROTOC_LIBRARY}) - endif() - target_include_directories (orc_protoc SYSTEM INTERFACE ${PROTOBUF_INCLUDE_DIR}) -endif() - -if (PROTOBUF_VENDORED) - add_dependencies (orc_protoc protobuf_ep) - add_dependencies (orc_protobuf protobuf_ep) - if (INSTALL_VENDORED_LIBS) - install(FILES "${PROTOBUF_STATIC_LIB}" "${PROTOC_STATIC_LIB}" - DESTINATION "lib") - endif () +add_library (orc::protobuf ALIAS orc_protobuf) +if (NOT (ORC_PACKAGE_KIND STREQUAL "conan" OR ORC_PACKAGE_KIND STREQUAL "vcpkg")) + add_library (orc::protoc ALIAS orc_protoc) endif () # ---------------------------------------------------------------------- @@ -509,7 +594,7 @@ if(BUILD_LIBHDFSPP) set (LIBHDFSPP_INCLUDE_DIR "${LIBHDFSPP_PREFIX}/include") set (LIBHDFSPP_STATIC_LIB_NAME hdfspp_static) set (LIBHDFSPP_STATIC_LIB "${LIBHDFSPP_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}${LIBHDFSPP_STATIC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") - set (LIBHDFSPP_SRC_URL "${CMAKE_SOURCE_DIR}/c++/libs/libhdfspp/libhdfspp.tar.gz") + set (LIBHDFSPP_SRC_URL "${PROJECT_SOURCE_DIR}/c++/libs/libhdfspp/libhdfspp.tar.gz") set (LIBHDFSPP_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${LIBHDFSPP_PREFIX} -DPROTOBUF_INCLUDE_DIR=${PROTOBUF_INCLUDE_DIR} @@ -536,15 +621,7 @@ if(BUILD_LIBHDFSPP) BUILD_BYPRODUCTS "${LIBHDFSPP_STATIC_LIB}" CMAKE_ARGS ${LIBHDFSPP_CMAKE_ARGS}) - include_directories (SYSTEM ${LIBHDFSPP_INCLUDE_DIR}) - - add_library (libhdfspp STATIC IMPORTED) - set_target_properties (libhdfspp PROPERTIES IMPORTED_LOCATION ${LIBHDFSPP_STATIC_LIB}) - add_dependencies (libhdfspp libhdfspp_ep) - if (INSTALL_VENDORED_LIBS) - install(FILES "${LIBHDFSPP_STATIC_LIB}" - DESTINATION "lib") - endif () + orc_add_built_library(libhdfspp_ep libhdfspp ${LIBHDFSPP_STATIC_LIB} ${LIBHDFSPP_INCLUDE_DIR}) set (LIBHDFSPP_LIBRARIES libhdfspp diff --git a/docker/README.md b/docker/README.md index e9a3b65b12..b89fed84ab 100644 --- a/docker/README.md +++ b/docker/README.md @@ -2,8 +2,9 @@ * Debian 11 and 12 * Fedora 37 -* Ubuntu 20, 22, 24 +* Ubuntu 22 and 24 * Oracle Linux 9 +* Amazon Linux 2023 ## Pre-built Images diff --git a/docker/ubuntu20/Dockerfile b/docker/amazonlinux23/Dockerfile similarity index 60% rename from docker/ubuntu20/Dockerfile rename to docker/amazonlinux23/Dockerfile index 59a487bb8d..806a58f898 100644 --- a/docker/ubuntu20/Dockerfile +++ b/docker/amazonlinux23/Dockerfile @@ -14,43 +14,35 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# ORC compile for Ubuntu 20 +# ORC compile for Amazon Linux 2023 # -FROM ubuntu:20.04 -LABEL maintainer="Apache ORC project " -ARG jdk=17 -ARG cc=gcc +FROM amazonlinux:2023 +LABEL org.opencontainers.image.authors="Apache ORC project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache ORC on Amazon Linux 2023" +LABEL org.opencontainers.image.version="" -RUN ln -fs /usr/share/zoneinfo/America/Los_Angeles /etc/localtime -RUN apt-get update -RUN apt-get install -y \ - cmake \ +RUN yum check-update || true +RUN yum install -y \ + cmake3 \ + curl-devel \ + cyrus-sasl-devel \ + expat-devel \ + gcc \ + gcc-c++ \ + gettext-devel \ git \ - libsasl2-dev \ - libssl-dev \ + libtool \ make \ - curl \ - maven \ - openjdk-${jdk}-jdk \ - tzdata; \ - if [ "${cc}" = "gcc" ] ; then \ - apt-get install -y \ - gcc \ - g++ \ - ; else \ - apt-get install -y \ - clang \ - && \ - update-alternatives --set cc /usr/bin/clang && \ - update-alternatives --set c++ /usr/bin/clang++ \ - ; fi -RUN update-alternatives --set java $(update-alternatives --list java | grep ${jdk}) && \ - update-alternatives --set javac $(update-alternatives --list javac | grep ${jdk}) - -ENV CC=cc -ENV CXX=c++ + openssl-devel \ + tar \ + wget \ + which \ + zlib-devel \ + java-17-amazon-corretto-devel +ENV TZ=America/Los_Angeles WORKDIR /root VOLUME /root/.m2/repository diff --git a/docker/debian11/Dockerfile b/docker/debian11/Dockerfile index fb804a316b..7af433de18 100644 --- a/docker/debian11/Dockerfile +++ b/docker/debian11/Dockerfile @@ -18,7 +18,10 @@ # FROM debian:bullseye -LABEL maintainer="Apache ORC project " +LABEL org.opencontainers.image.authors="Apache ORC project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache ORC on Debian 11" +LABEL org.opencontainers.image.version="" ARG jdk=17 RUN apt-get update diff --git a/docker/debian12/Dockerfile b/docker/debian12/Dockerfile index f0c2a600eb..ae341183f9 100644 --- a/docker/debian12/Dockerfile +++ b/docker/debian12/Dockerfile @@ -18,7 +18,10 @@ # FROM debian:bookworm -LABEL maintainer="Apache ORC project " +LABEL org.opencontainers.image.authors="Apache ORC project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache ORC on Debian 12" +LABEL org.opencontainers.image.version="" ARG jdk=17 RUN apt-get update diff --git a/docker/fedora37/Dockerfile b/docker/fedora37/Dockerfile index bf4a50fc1c..a2d0748ded 100644 --- a/docker/fedora37/Dockerfile +++ b/docker/fedora37/Dockerfile @@ -14,11 +14,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# ORC compile for CentOS 7 +# ORC compile for Fedora Linux 37 # FROM fedora:37 -LABEL maintainer="Apache ORC project " +LABEL org.opencontainers.image.authors="Apache ORC project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache ORC on Fedora Linux 37" +LABEL org.opencontainers.image.version="" RUN yum check-update || true RUN yum install -y \ diff --git a/docker/oraclelinux9/Dockerfile b/docker/oraclelinux9/Dockerfile index 094ec828f2..a0f9623490 100644 --- a/docker/oraclelinux9/Dockerfile +++ b/docker/oraclelinux9/Dockerfile @@ -18,7 +18,8 @@ # FROM oraclelinux:9 -LABEL maintainer="Apache ORC project " +LABEL org.opencontainers.image.authors="Apache ORC project " +LABEL org.opencontainers.image.licenses="Apache-2.0" RUN yum check-update || true RUN yum install -y \ diff --git a/docker/os-list.txt b/docker/os-list.txt index 3966df3245..e4a288bd1c 100644 --- a/docker/os-list.txt +++ b/docker/os-list.txt @@ -1,7 +1,7 @@ debian11 debian12 -ubuntu20 ubuntu22 ubuntu24 fedora37 oraclelinux9 +amazonlinux23 diff --git a/docker/ubuntu22/Dockerfile b/docker/ubuntu22/Dockerfile index 81f6269518..03863f20a4 100644 --- a/docker/ubuntu22/Dockerfile +++ b/docker/ubuntu22/Dockerfile @@ -18,7 +18,10 @@ # FROM ubuntu:22.04 -LABEL maintainer="Apache ORC project " +LABEL org.opencontainers.image.authors="Apache ORC project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache ORC on Ubuntu 22" +LABEL org.opencontainers.image.version="" ARG jdk=17 ARG cc=gcc diff --git a/docker/ubuntu24/Dockerfile b/docker/ubuntu24/Dockerfile index 34b3924330..00cd2d67e7 100644 --- a/docker/ubuntu24/Dockerfile +++ b/docker/ubuntu24/Dockerfile @@ -18,7 +18,10 @@ # FROM ubuntu:24.04 -LABEL maintainer="Apache ORC project " +LABEL org.opencontainers.image.authors="Apache ORC project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache ORC on Ubuntu 24" +LABEL org.opencontainers.image.version="" ARG jdk=21 ARG cc=gcc diff --git a/java/.mvn/jvm.config b/java/.mvn/jvm.config new file mode 100644 index 0000000000..81b88d8173 --- /dev/null +++ b/java/.mvn/jvm.config @@ -0,0 +1 @@ +--enable-native-access=ALL-UNNAMED diff --git a/java/bench/core/pom.xml b/java/bench/core/pom.xml index cf6fe1ad51..3965021c07 100644 --- a/java/bench/core/pom.xml +++ b/java/bench/core/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc-benchmarks - 2.1.0-SNAPSHOT + 2.2.0-SNAPSHOT .. 
diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroReader.java b/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroReader.java index 97b58a8fea..8474351f2b 100644 --- a/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroReader.java +++ b/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroReader.java @@ -41,6 +41,7 @@ import org.apache.orc.bench.core.convert.BatchReader; import java.io.IOException; +import java.math.BigInteger; import java.nio.ByteBuffer; import java.util.List; @@ -201,7 +202,14 @@ public void convert(ColumnVector cv, int row, Object value) { cv.isNull[row] = true; } else { DecimalColumnVector tc = (DecimalColumnVector) cv; - tc.vector[row].set(HiveDecimal.create(Math.round((double) value * multiplier))); + if (value instanceof ByteBuffer) { + tc.vector[row].set(getHiveDecimalFromByteBuffer((ByteBuffer) value, scale)); + } else if (value instanceof GenericData.Fixed) { + tc.vector[row].set(getHiveDecimalFromByteBuffer( + ByteBuffer.wrap(((GenericData.Fixed) value).bytes()), scale)); + } else { + tc.vector[row].set(HiveDecimal.create(Math.round((double) value * multiplier))); + } } } } @@ -289,6 +297,13 @@ static AvroConverter createConverter(TypeDescription types) { } } + + static HiveDecimal getHiveDecimalFromByteBuffer(ByteBuffer byteBuffer, + int scale) { + byte[] result = getBytesFromByteBuffer(byteBuffer); + HiveDecimal dec = HiveDecimal.create(new BigInteger(result), scale); + return dec; + } + static byte[] getBytesFromByteBuffer(ByteBuffer byteBuffer) { byteBuffer.rewind(); byte[] result = new byte[byteBuffer.limit()]; diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroSchemaUtils.java b/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroSchemaUtils.java index 96df6b5ba1..65753553a4 100644 --- a/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroSchemaUtils.java +++ b/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroSchemaUtils.java @@ -78,8 +78,11 @@ public static Schema createAvroSchema(TypeDescription typeInfo) { case DECIMAL: String precision = String.valueOf(typeInfo.getPrecision()); String scale = String.valueOf(typeInfo.getScale()); + int bytes = PRECISION_TO_BYTE_COUNT[typeInfo.getPrecision() - 1]; schema = getSchemaFor("{" + - "\"type\":\"bytes\"," + + "\"type\":\"fixed\"," + + "\"name\":\"" + typeInfo.getFullFieldName() + "\"," + + "\"size\":" + bytes + "," + "\"logicalType\":\"decimal\"," + "\"precision\":" + precision + "," + "\"scale\":" + scale + "}"); @@ -189,4 +192,16 @@ private static Schema getSchemaFor(String str) { Schema.Parser parser = new Schema.Parser(); return parser.parse(str); } + + // org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe + // Map precision to the number of bytes needed for binary conversion. + public static final int[] PRECISION_TO_BYTE_COUNT = new int[38]; + + static { + for (int prec = 1; prec <= 38; prec++) { + // Estimated number of bytes needed.
+ PRECISION_TO_BYTE_COUNT[prec - 1] = (int) + Math.ceil((Math.log(Math.pow(10, prec) - 1) / Math.log(2) + 1) / 8); + } + } } diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroWriter.java b/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroWriter.java index d60ef6745d..34fa166673 100644 --- a/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroWriter.java +++ b/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroWriter.java @@ -40,7 +40,6 @@ import org.apache.orc.bench.core.convert.BatchWriter; import java.io.IOException; -import java.nio.Buffer; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.List; @@ -166,8 +165,12 @@ public Object convert(ColumnVector cv, int row) { } private static class DecimalConverter implements AvroConverter { + final Schema avroSchema; + final int precision; final int scale; - DecimalConverter(int scale) { + DecimalConverter(Schema avroSchema, int precision, int scale) { + this.avroSchema = avroSchema; + this.precision = precision; this.scale = scale; } public Object convert(ColumnVector cv, int row) { @@ -176,8 +179,7 @@ public Object convert(ColumnVector cv, int row) { } if (cv.noNulls || !cv.isNull[row]) { DecimalColumnVector vector = (DecimalColumnVector) cv; - return getBufferFromDecimal( - vector.vector[row].getHiveDecimal(), scale); + return decimalToBinary(vector.vector[row].getHiveDecimal(), avroSchema, precision, scale); } else { return null; } @@ -270,7 +272,7 @@ public static AvroConverter createConverter(TypeDescription types, case TIMESTAMP: return new TimestampConverter(); case DECIMAL: - return new DecimalConverter(types.getScale()); + return new DecimalConverter(avroSchema, types.getPrecision(), types.getScale()); case LIST: return new ListConverter(types, avroSchema); case STRUCT: @@ -356,11 +358,28 @@ public void close() throws IOException { writer.close(); } - static Buffer getBufferFromDecimal(HiveDecimal dec, int scale) { - if (dec == null) { - return null; + // org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriter.DecimalDataWriter.decimalToBinary() + private static GenericData.Fixed decimalToBinary(HiveDecimal hiveDecimal, + Schema avroSchema, int prec, int scale) { + byte[] decimalBytes = hiveDecimal.setScale(scale).unscaledValue().toByteArray(); + + // Estimated number of bytes needed. + int precToBytes = AvroSchemaUtils.PRECISION_TO_BYTE_COUNT[prec - 1]; + if (precToBytes == decimalBytes.length) { + // No padding needed. + return new GenericData.Fixed(avroSchema, decimalBytes); + } + + byte[] tgt = new byte[precToBytes]; + if (hiveDecimal.signum() == -1) { + // For negative number, initializing bits to 1 + for (int i = 0; i < precToBytes; i++) { + tgt[i] |= 0xFF; + } } - return ByteBuffer.wrap(dec.bigIntegerBytesScaled(scale)); + System.arraycopy(decimalBytes, 0, tgt, precToBytes - decimalBytes.length, + decimalBytes.length); // Padding leading zeroes/ones. 
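The fixed-width encoding above is easier to see with concrete numbers. Below is a minimal, self-contained sketch of the same byte-level scheme using only java.math instead of HiveDecimal and Avro; the class and method names are illustrative, not part of the patch:

    import java.math.BigDecimal;
    import java.math.BigInteger;
    import java.util.Arrays;

    public class DecimalFixedSketch {
      // Same estimate as PRECISION_TO_BYTE_COUNT: the smallest byte count whose
      // two's-complement range covers an unscaled value of up to 10^prec - 1.
      static int bytesForPrecision(int prec) {
        return (int) Math.ceil((Math.log(Math.pow(10, prec) - 1) / Math.log(2) + 1) / 8);
      }

      public static void main(String[] args) {
        int size = bytesForPrecision(10);                 // decimal(10,2) -> 5 bytes

        BigDecimal value = new BigDecimal("-1.00");
        byte[] raw = value.unscaledValue().toByteArray(); // -100 -> [0x9C], one byte

        // Sign-extend into the fixed-size buffer, as decimalToBinary() does:
        // negative values are padded with 0xFF bytes, non-negative with 0x00.
        byte[] fixed = new byte[size];
        if (value.signum() == -1) {
          Arrays.fill(fixed, (byte) 0xFF);
        }
        System.arraycopy(raw, 0, fixed, size - raw.length, raw.length);
        // fixed = [FF, FF, FF, FF, 9C]

        // Reading back, as getHiveDecimalFromByteBuffer() does via BigInteger:
        BigDecimal roundTrip = new BigDecimal(new BigInteger(fixed), 2);
        System.out.println(roundTrip);                    // prints -1.00
      }
    }

For decimal(10,2), ceil((log2(10^10 - 1) + 1) / 8) = ceil(34.22 / 8) = 5, so every value occupies exactly five bytes and the pad bytes must carry the sign.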
+ return new GenericData.Fixed(avroSchema, tgt); } } diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonReader.java b/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonReader.java index 893b738b1c..ece88f08b8 100644 --- a/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonReader.java +++ b/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonReader.java @@ -172,8 +172,12 @@ public void convert(JsonElement value, ColumnVector vect, int row) { vect.isNull[row] = true; } else { TimestampColumnVector vector = (TimestampColumnVector) vect; - vector.set(row, Timestamp.valueOf(value.getAsString() - .replaceAll("[TZ]", " "))); + try { + vector.set(row, new Timestamp(value.getAsLong())); + } catch (NumberFormatException e) { + vector.set(row, Timestamp.valueOf(value.getAsString() + .replaceAll("[TZ]", " "))); + } } } } diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonWriter.java b/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonWriter.java index 00b3de22e6..527d8bf1cc 100644 --- a/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonWriter.java +++ b/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonWriter.java @@ -160,8 +160,7 @@ static void printValue(com.google.gson.stream.JsonWriter writer, ColumnVector ve (int) ((LongColumnVector) vector).vector[row]).toString()); break; case TIMESTAMP: - writer.value(((TimestampColumnVector) vector) - .asScratchTimestamp(row).toString()); + writer.value(((TimestampColumnVector) vector).getTimestampAsLong(row)); break; case LIST: printList(writer, (ListColumnVector) vector, schema, row); diff --git a/java/bench/core/src/resources/taxi.schema b/java/bench/core/src/resources/taxi.schema index 720848faaa..adb1f54f8d 100644 --- a/java/bench/core/src/resources/taxi.schema +++ b/java/bench/core/src/resources/taxi.schema @@ -9,13 +9,13 @@ struct< PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, - fare_amount: decimal(8,2), - extra: decimal(8,2), - mta_tax: decimal(8,2), - tip_amount: decimal(8,2), - tolls_amount: decimal(8,2), - improvement_surcharge: decimal(8,2), - total_amount: decimal(8,2), + fare_amount: decimal(10,2), + extra: decimal(10,2), + mta_tax: decimal(10,2), + tip_amount: decimal(10,2), + tolls_amount: decimal(10,2), + improvement_surcharge: decimal(10,2), + total_amount: decimal(10,2), congestion_surcharge: int, airport_fee: int > diff --git a/java/bench/core/src/test/org/apache/orc/bench/core/impl/ChunkReadUtilTest.java b/java/bench/core/src/test/org/apache/orc/bench/core/impl/ChunkReadUtilTest.java index 1169998d86..7091927521 100644 --- a/java/bench/core/src/test/org/apache/orc/bench/core/impl/ChunkReadUtilTest.java +++ b/java/bench/core/src/test/org/apache/orc/bench/core/impl/ChunkReadUtilTest.java @@ -21,6 +21,9 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.util.VersionInfo; +import org.apache.orc.impl.HadoopShims; +import org.apache.orc.impl.HadoopShimsFactory; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -38,6 +41,9 @@ class ChunkReadUtilTest { private static long fileLength; private static final int ROW_COUNT = 524288; private static final int COL_COUNT = 16; + private static final HadoopShims SHIMS = HadoopShimsFactory.get(); + private static final boolean supportVectoredIO = + SHIMS.supportVectoredIO(VersionInfo.getVersion()); 
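The JsonReader/JsonWriter change above accepts two timestamp representations: epoch milliseconds when the JSON value is numeric, and the T-separated literal otherwise. A hedged sketch of the same fallback, assuming millisecond precision for numeric values (the trim() here is an added guard against the trailing space left when a final 'Z' is stripped):

    import java.sql.Timestamp;

    public class JsonTimestampSketch {
      static Timestamp parse(String raw) {
        try {
          return new Timestamp(Long.parseLong(raw));      // e.g. "1388534400000"
        } catch (NumberFormatException e) {
          return Timestamp.valueOf(raw.replaceAll("[TZ]", " ").trim());
        }
      }

      public static void main(String[] args) {
        System.out.println(parse("0"));                    // epoch, rendered in local time
        System.out.println(parse("2014-01-01T00:00:00Z")); // parsed via the string fallback
      }
    }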
@BeforeAll public static void setup() throws IOException { @@ -57,7 +63,7 @@ public void testReadAll() throws IOException { Configuration conf = new Configuration(); readStart(); assertEquals(ROW_COUNT, ChunkReadUtil.readORCFile(filePath, conf, false)); - assertTrue((readEnd().getBytesRead() / (double) fileLength) > 1); + assertTrue(supportVectoredIO || (readEnd().getBytesRead() / (double) fileLength) > 1); } @Test @@ -75,7 +81,7 @@ public void testReadAlternateWMinSeekSize() throws IOException { readStart(); assertEquals(ROW_COUNT, ChunkReadUtil.readORCFile(filePath, conf, true)); double readFraction = readEnd().getBytesRead() / (double) fileLength; - assertTrue(readFraction > 1 && readFraction < 1.01); + assertTrue(supportVectoredIO || (readFraction > 1 && readFraction < 1.01)); } @Test @@ -85,6 +91,6 @@ public void testReadAlternateWMinSeekSizeDrop() throws IOException { readStart(); assertEquals(ROW_COUNT, ChunkReadUtil.readORCFile(filePath, conf, true)); double readFraction = readEnd().getBytesRead() / (double) fileLength; - assertTrue(readFraction > 1 && readFraction < 1.01); + assertTrue(supportVectoredIO || (readFraction > 1 && readFraction < 1.01)); } -} \ No newline at end of file +} diff --git a/java/bench/hive/pom.xml b/java/bench/hive/pom.xml index 8dba74a0de..52a447948e 100644 --- a/java/bench/hive/pom.xml +++ b/java/bench/hive/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc-benchmarks - 2.1.0-SNAPSHOT + 2.2.0-SNAPSHOT .. diff --git a/java/bench/hive/src/java/org/apache/orc/bench/hive/ColumnProjectionBenchmark.java b/java/bench/hive/src/java/org/apache/orc/bench/hive/ColumnProjectionBenchmark.java index 9c1b7fd21a..48806faffe 100644 --- a/java/bench/hive/src/java/org/apache/orc/bench/hive/ColumnProjectionBenchmark.java +++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/ColumnProjectionBenchmark.java @@ -19,6 +19,7 @@ package org.apache.orc.bench.hive; import com.google.auto.service.AutoService; +import org.apache.commons.cli.CommandLine; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -38,6 +39,7 @@ import org.apache.orc.bench.core.IOCounters; import org.apache.orc.bench.core.OrcBenchmark; import org.apache.orc.bench.core.Utilities; +import org.apache.orc.bench.core.convert.GenerateVariants; import org.apache.parquet.hadoop.ParquetInputFormat; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; @@ -47,6 +49,7 @@ import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.options.OptionsBuilder; import java.net.URI; import java.util.List; @@ -78,7 +81,13 @@ public String getDescription() { @Override public void run(String[] args) throws Exception { - new Runner(Utilities.parseOptions(args, getClass())).run(); + CommandLine cmds = GenerateVariants.parseCommandLine(args); + new Runner(new OptionsBuilder() + .parent(Utilities.parseOptions(args, this.getClass())) + .param("compression", cmds.getOptionValue("compress", "snappy,gz,zstd").split(",")) + .param("dataset", cmds.getOptionValue("data", "github,sales,taxi").split(",")) + .build() + ).run(); } @Benchmark diff --git a/java/bench/hive/src/java/org/apache/orc/bench/hive/FullReadBenchmark.java b/java/bench/hive/src/java/org/apache/orc/bench/hive/FullReadBenchmark.java index dc1bcca922..8f3b1cbbaa 100644 --- a/java/bench/hive/src/java/org/apache/orc/bench/hive/FullReadBenchmark.java +++ 
b/java/bench/hive/src/java/org/apache/orc/bench/hive/FullReadBenchmark.java @@ -25,6 +25,7 @@ import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumReader; import org.apache.avro.mapred.FsInput; +import org.apache.commons.cli.CommandLine; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -45,6 +46,7 @@ import org.apache.orc.bench.core.IOCounters; import org.apache.orc.bench.core.OrcBenchmark; import org.apache.orc.bench.core.Utilities; +import org.apache.orc.bench.core.convert.GenerateVariants; import org.apache.parquet.hadoop.ParquetInputFormat; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; @@ -54,6 +56,7 @@ import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.options.OptionsBuilder; import java.io.InputStream; import java.io.InputStreamReader; @@ -87,7 +90,13 @@ public String getDescription() { @Override public void run(String[] args) throws Exception { - new Runner(Utilities.parseOptions(args, getClass())).run(); + CommandLine cmds = GenerateVariants.parseCommandLine(args); + new Runner(new OptionsBuilder() + .parent(Utilities.parseOptions(args, this.getClass())) + .param("compression", cmds.getOptionValue("compress", "gz,snappy,zstd").split(",")) + .param("dataset", cmds.getOptionValue("data", "taxi,sales,github").split(",")) + .build() + ).run(); } @Benchmark diff --git a/java/bench/pom.xml b/java/bench/pom.xml index a50eb3a425..085611e14b 100644 --- a/java/bench/pom.xml +++ b/java/bench/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc - 2.1.0-SNAPSHOT + 2.2.0-SNAPSHOT ../pom.xml @@ -33,13 +33,15 @@ - 1.11.3 - 4.0.0 + 1.12.0 + 4.0.1 1.37 - 5.10.2 + 5.12.1 ${project.version} - 1.13.1 - 3.5.1 + 1.15.1 + 2.13 + 2.13.14 + 4.0.0-preview2 @@ -71,7 +73,7 @@ com.google.code.gson gson - 2.2.4 + 2.13.0 com.google.guava @@ -80,17 +82,17 @@ commons-cli commons-cli - 1.6.0 + 1.9.0 io.airlift aircompressor - 0.26 + 2.0.2 io.netty netty-all - 4.1.96.Final + 4.1.110.Final runtime @@ -106,7 +108,7 @@ org.apache.commons commons-csv - 1.10.0 + 1.14.0 org.apache.hadoop @@ -275,7 +277,7 @@ org.xerial.snappy snappy-java - 1.1.10.5 + 1.1.10.7 org.apache.parquet @@ -284,12 +286,12 @@ org.apache.spark - spark-catalyst_2.12 + spark-catalyst_${scala.binary.version} ${spark.version} org.apache.spark - spark-core_2.12 + spark-core_${scala.binary.version} ${spark.version} @@ -316,7 +318,7 @@ org.apache.spark - spark-sql_2.12 + spark-sql_${scala.binary.version} ${spark.version} @@ -335,7 +337,7 @@ org.apache.spark - spark-avro_2.12 + spark-avro_${scala.binary.version} ${spark.version} @@ -357,7 +359,7 @@ org.scala-lang scala-library - 2.12.18 + ${scala.version} org.slf4j diff --git a/java/bench/spark/pom.xml b/java/bench/spark/pom.xml index 7eeef0d00a..31070bd2aa 100644 --- a/java/bench/spark/pom.xml +++ b/java/bench/spark/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc-benchmarks - 2.1.0-SNAPSHOT + 2.2.0-SNAPSHOT .. 
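The run() rewrites above forward command-line choices into JMH parameters rather than relying only on annotation defaults. A sketch of that pattern against the public JMH API; the include pattern and parameter values here are illustrative:

    import org.openjdk.jmh.runner.Runner;
    import org.openjdk.jmh.runner.RunnerException;
    import org.openjdk.jmh.runner.options.Options;
    import org.openjdk.jmh.runner.options.OptionsBuilder;

    public class ParamForwardingSketch {
      public static void main(String[] args) throws RunnerException {
        Options opts = new OptionsBuilder()
            .include("FullReadBenchmark")                  // benchmark selection regex
            .param("compression", "gz", "snappy", "zstd")  // overrides @Param defaults
            .param("dataset", "taxi", "sales", "github")
            .build();
        new Runner(opts).run();
      }
    }

JMH runs every combination of the supplied parameter values, which is how a single CLI option like compress ends up sweeping all three codecs in one invocation.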
@@ -71,15 +71,15 @@ org.apache.spark - spark-catalyst_2.12 + spark-catalyst_${scala.binary.version} org.apache.spark - spark-core_2.12 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.12 + spark-sql_${scala.binary.version} org.apache.parquet @@ -88,7 +88,7 @@ org.apache.spark - spark-avro_2.12 + spark-avro_${scala.binary.version} org.jodd @@ -125,7 +125,7 @@ org.objenesis objenesis - 3.2 + 3.3 compile diff --git a/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java b/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java index 1285875dcf..86e65ae81e 100644 --- a/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java +++ b/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java @@ -61,9 +61,9 @@ import scala.Tuple2; import scala.collection.Iterator; import scala.collection.JavaConverters; -import scala.collection.Seq; import scala.collection.immutable.Map; import scala.collection.immutable.Map$; +import scala.collection.immutable.Seq; import java.io.IOException; import java.sql.Timestamp; @@ -74,7 +74,8 @@ @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.MICROSECONDS) @AutoService(OrcBenchmark.class) -@Fork(jvmArgsAppend = "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED") +@Fork(jvmArgsAppend = {"--add-opens=java.base/sun.nio.ch=ALL-UNNAMED", + "--add-opens=java.base/sun.util.calendar=ALL-UNNAMED"}) public class SparkBenchmark implements OrcBenchmark { private static final Path root = Utilities.getBenchmarkRoot(); @@ -195,6 +196,9 @@ public void fullRead(InputSource source, case "orc": options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918 break; + case "parquet": + options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918 + break; default: break; } @@ -228,6 +232,9 @@ public void partialRead(InputSource source, case "orc": options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918 break; + case "parquet": + options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918 + break; default: break; } @@ -303,6 +310,9 @@ public void pushDown(InputSource source, case "orc": options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918 break; + case "parquet": + options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918 + break; default: break; } diff --git a/java/core/pom.xml b/java/core/pom.xml index 4cafffc714..5813364532 100644 --- a/java/core/pom.xml +++ b/java/core/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc - 2.1.0-SNAPSHOT + 2.2.0-SNAPSHOT ../pom.xml @@ -43,10 +43,6 @@ com.google.protobuf protobuf-java - - org.apache.commons - commons-lang3 - io.airlift aircompressor @@ -86,6 +82,11 @@ + + org.apache.commons + commons-lang3 + test + com.google.guava guava diff --git a/java/core/src/java/org/apache/orc/OrcConf.java b/java/core/src/java/org/apache/orc/OrcConf.java index 9bc2b4492e..6516517ba2 100644 --- a/java/core/src/java/org/apache/orc/OrcConf.java +++ b/java/core/src/java/org/apache/orc/OrcConf.java @@ -18,7 +18,6 @@ package org.apache.orc; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import java.util.ArrayList; @@ -353,12 +352,12 @@ public String getString(Configuration conf) { public List getStringAsList(Configuration conf) { String value = getString(null, conf); List confList = new ArrayList<>(); - if (StringUtils.isEmpty(value)) { + if (value == null || value.isEmpty()) { return confList; } for (String str: value.split(",")) { - String trimStr = StringUtils.trim(str); - if 
(StringUtils.isNotEmpty(trimStr)) { + String trimStr = str.trim(); + if (!trimStr.isEmpty()) { confList.add(trimStr); } } diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index c5e13cc3c0..d6147050ec 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -102,6 +102,8 @@ public void updateBoolean(boolean value, int repetitions) { public void merge(ColumnStatisticsImpl other) { if (other instanceof BooleanStatisticsImpl bkt) { trueCount += bkt.trueCount; + } else if (!(other instanceof BooleanColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of boolean column statistics"); } else { if (isStatsExists() && trueCount != 0) { throw new IllegalArgumentException("Incompatible merging of boolean column statistics"); @@ -222,6 +224,8 @@ public void merge(ColumnStatisticsImpl other) { } } sum += otherColl.sum; + } else if (!(other instanceof CollectionColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of collection column statistics"); } else { if (isStatsExists()) { throw new IllegalArgumentException("Incompatible merging of collection column statistics"); @@ -397,6 +401,8 @@ public void merge(ColumnStatisticsImpl other) { overflow = true; } } + } else if (!(other instanceof IntegerColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of integer column statistics"); } else { if (isStatsExists() && hasMinimum) { throw new IllegalArgumentException("Incompatible merging of integer column statistics"); @@ -560,6 +566,8 @@ public void merge(ColumnStatisticsImpl other) { } } sum += dbl.sum; + } else if (!(other instanceof DoubleColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of double column statistics"); } else { if (isStatsExists() && hasMinimum) { throw new IllegalArgumentException("Incompatible merging of double column statistics"); @@ -763,6 +771,8 @@ public void merge(ColumnStatisticsImpl other) { } } sum += str.sum; + } else if (!(other instanceof StringColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of string column statistics"); } else { if (isStatsExists()) { throw new IllegalArgumentException("Incompatible merging of string column statistics"); @@ -993,9 +1003,10 @@ public void updateBinary(byte[] bytes, int offset, int length, @Override public void merge(ColumnStatisticsImpl other) { - if (other instanceof BinaryColumnStatistics) { - BinaryStatisticsImpl bin = (BinaryStatisticsImpl) other; + if (other instanceof BinaryStatisticsImpl bin) { sum += bin.sum; + } else if (!(other instanceof BinaryColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of binary column statistics"); } else { if (isStatsExists() && sum != 0) { throw new IllegalArgumentException("Incompatible merging of binary column statistics"); @@ -1128,6 +1139,8 @@ public void merge(ColumnStatisticsImpl other) { sum.mutateAdd(dec.sum); } } + } else if (!(other instanceof DecimalColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of decimal column statistics"); } else { if (isStatsExists() && minimum != null) { throw new IllegalArgumentException("Incompatible merging of decimal column statistics"); @@ -1321,6 +1334,8 @@ public void merge(ColumnStatisticsImpl other) { hasSum = false; } } + } else if (!(other instanceof DecimalColumnStatistics)) 
{ + throw new IllegalArgumentException("Incompatible merging of decimal column statistics"); } else { if (other.getNumberOfValues() != 0) { throw new IllegalArgumentException("Incompatible merging of decimal column statistics"); @@ -1486,6 +1501,8 @@ public void merge(ColumnStatisticsImpl other) { if (other instanceof DateStatisticsImpl dateStats) { minimum = Math.min(minimum, dateStats.minimum); maximum = Math.max(maximum, dateStats.maximum); + } else if (!(other instanceof DateColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of date column statistics"); } else { if (isStatsExists() && count != 0) { throw new IllegalArgumentException("Incompatible merging of date column statistics"); @@ -1698,6 +1715,8 @@ public void merge(ColumnStatisticsImpl other) { maximum = timestampStats.maximum; } } + } else if (!(other instanceof TimestampColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of timestamp column statistics"); } else { if (isStatsExists() && count != 0) { throw new IllegalArgumentException("Incompatible merging of timestamp column statistics"); diff --git a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java index 4635973ab5..6886b551e8 100644 --- a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java +++ b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java @@ -17,7 +17,6 @@ */ package org.apache.orc.impl; -import org.apache.commons.lang3.ArrayUtils; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; @@ -1446,6 +1445,7 @@ public void nextVector(ColumnVector previousVector, } public static class StringGroupFromBinaryTreeReader extends ConvertTreeReader { + public static final byte[] EMPTY_BYTE_ARRAY = new byte[0]; private final TypeDescription readerType; private BytesColumnVector inBytesColVector; private BytesColumnVector outBytesColVector; @@ -1461,7 +1461,7 @@ public void setConvertVectorElement(int elementNum) throws IOException { byte[] bytes = inBytesColVector.vector[elementNum]; int start = inBytesColVector.start[elementNum]; int length = inBytesColVector.length[elementNum]; - final byte[] string = (length == 0) ? ArrayUtils.EMPTY_BYTE_ARRAY : new byte[3 * length - 1]; + final byte[] string = (length == 0) ? 
EMPTY_BYTE_ARRAY : new byte[3 * length - 1]; for(int p = 0; p < string.length; p += 2) { if (p != 0) { string[p++] = ' '; diff --git a/java/core/src/java/org/apache/orc/impl/ParserUtils.java b/java/core/src/java/org/apache/orc/impl/ParserUtils.java index df2f8b5e19..c864465bde 100644 --- a/java/core/src/java/org/apache/orc/impl/ParserUtils.java +++ b/java/core/src/java/org/apache/orc/impl/ParserUtils.java @@ -31,6 +31,8 @@ import java.util.regex.Pattern; public class ParserUtils { + private static final TypeDescription.Category[] TYPE_DESCRIPTION_CATEGORY_VALUES + = TypeDescription.Category.values(); static TypeDescription.Category parseCategory(ParserUtils.StringPosition source) { StringBuilder word = new StringBuilder(); @@ -56,7 +58,7 @@ static TypeDescription.Category parseCategory(ParserUtils.StringPosition source) catString = catString.trim(); } if (!catString.isEmpty()) { - for (TypeDescription.Category cat : TypeDescription.Category.values()) { + for (TypeDescription.Category cat : TYPE_DESCRIPTION_CATEGORY_VALUES) { if (cat.getName().equals(catString)) { return cat; } diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java index 3afbff5fc3..9e018157f6 100644 --- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java @@ -65,6 +65,10 @@ public class ReaderImpl implements Reader { private static final Logger LOG = LoggerFactory.getLogger(ReaderImpl.class); + private static final OrcFile.Version[] ORC_FILE_VERSION_VALUES = OrcFile.Version.values(); + private static final OrcFile.WriterVersion[] ORC_FILE_WRITER_VERSION_VALUES + = OrcFile.WriterVersion.values(); + private static final int DIRECTORY_SIZE_GUESS = 16 * 1024; public static final int DEFAULT_COMPRESSION_BLOCK_SIZE = 256 * 1024; @@ -268,7 +272,7 @@ public static OrcFile.Version getFileVersion(List versionList) { if (versionList == null || versionList.isEmpty()) { return OrcFile.Version.V_0_11; } - for (OrcFile.Version version: OrcFile.Version.values()) { + for (OrcFile.Version version: ORC_FILE_VERSION_VALUES) { if (version.getMajor() == versionList.get(0) && version.getMinor() == versionList.get(1)) { return version; @@ -620,7 +624,7 @@ protected Supplier getFileSystemSupplier() { * @return the version of the software that produced the file */ public static OrcFile.WriterVersion getWriterVersion(int writerVersion) { - for(OrcFile.WriterVersion version: OrcFile.WriterVersion.values()) { + for(OrcFile.WriterVersion version: ORC_FILE_WRITER_VERSION_VALUES) { if (version.getId() == writerVersion) { return version; } diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java index 323f242471..c9256964e5 100644 --- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java @@ -17,7 +17,6 @@ */ package org.apache.orc.impl; -import org.apache.commons.lang3.ArrayUtils; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.type.HiveDecimal; @@ -340,7 +339,14 @@ protected RecordReaderImpl(ReaderImpl fileReader, this.startReadPhase = TypeReader.ReadPhase.ALL; } - this.rowIndexColsToRead = ArrayUtils.contains(rowIndexCols, true) ? rowIndexCols : null; + var hasTrue = false; + for (boolean value: rowIndexCols) { + if (value) { + hasTrue = true; + break; + } + } + this.rowIndexColsToRead = hasTrue ? 
rowIndexCols : null; TreeReaderFactory.ReaderContext readerContext = new TreeReaderFactory.ReaderContext() .setSchemaEvolution(evolution) diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java b/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java index 0eabb421e0..b4155ada81 100644 --- a/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java +++ b/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java @@ -17,12 +17,12 @@ */ package org.apache.orc.impl; -import org.apache.commons.lang3.builder.HashCodeBuilder; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileRange; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.io.DiskRangeList; +import org.apache.hadoop.util.VersionInfo; import org.apache.orc.CompressionCodec; import org.apache.orc.DataReader; import org.apache.orc.OrcProto; @@ -48,7 +48,8 @@ */ public class RecordReaderUtils { private static final HadoopShims SHIMS = HadoopShimsFactory.get(); - private static final boolean supportVectoredIO = SHIMS.supportVectoredIO(); + private static final boolean supportVectoredIO = + SHIMS.supportVectoredIO(VersionInfo.getVersion()); private static final Logger LOG = LoggerFactory.getLogger(RecordReaderUtils.class); private static class DefaultDataReader implements DataReader { @@ -635,8 +636,8 @@ public boolean equals(Object rhs) { @Override public int hashCode() { - return new HashCodeBuilder().append(capacity).append(insertionGeneration) - .toHashCode(); + // This is identical to the previous hashCode from HashCodeBuilder + return (17 * 37 + capacity) * 37 + (int) (insertionGeneration ^ insertionGeneration >> 32); } } diff --git a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java index 2a2adf50d7..418b9c9561 100644 --- a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java +++ b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java @@ -1551,7 +1551,6 @@ private void nextVector(DecimalColumnVector result, HiveDecimalWritable[] vector = result.vector; HiveDecimalWritable decWritable; if (result.noNulls) { - result.isRepeating = true; for (int r = 0; r < batchSize; ++r) { decWritable = vector[r]; if (!decWritable.serializationUtilsRead( @@ -1563,7 +1562,6 @@ private void nextVector(DecimalColumnVector result, setIsRepeatingIfNeeded(result, r); } } else if (!result.isRepeating || !result.isNull[0]) { - result.isRepeating = true; for (int r = 0; r < batchSize; ++r) { if (!result.isNull[r]) { decWritable = vector[r]; @@ -1595,7 +1593,6 @@ private void nextVector(DecimalColumnVector result, HiveDecimalWritable[] vector = result.vector; HiveDecimalWritable decWritable; if (result.noNulls) { - result.isRepeating = true; int previousIdx = 0; for (int r = 0; r != filterContext.getSelectedSize(); ++r) { int idx = filterContext.getSelected()[r]; @@ -1614,7 +1611,6 @@ private void nextVector(DecimalColumnVector result, } skipStreamRows(batchSize - previousIdx); } else if (!result.isRepeating || !result.isNull[0]) { - result.isRepeating = true; int previousIdx = 0; for (int r = 0; r != filterContext.getSelectedSize(); ++r) { int idx = filterContext.getSelected()[r]; @@ -1651,14 +1647,12 @@ private void nextVector(Decimal64ColumnVector result, // read the scales scaleReader.nextVector(result, scratchScaleVector, batchSize); if (result.noNulls) { - result.isRepeating = true; for (int r = 0; r < batchSize; ++r) { final long scaleFactor =
powerOfTenTable[scale - scratchScaleVector[r]]; result.vector[r] = SerializationUtils.readVslong(valueStream) * scaleFactor; setIsRepeatingIfNeeded(result, r); } } else if (!result.isRepeating || !result.isNull[0]) { - result.isRepeating = true; for (int r = 0; r < batchSize; ++r) { if (!result.isNull[r]) { final long scaleFactor = powerOfTenTable[scale - scratchScaleVector[r]]; @@ -1686,7 +1680,6 @@ private void nextVector(Decimal64ColumnVector result, // Read all the scales scaleReader.nextVector(result, scratchScaleVector, batchSize); if (result.noNulls) { - result.isRepeating = true; int previousIdx = 0; for (int r = 0; r != filterContext.getSelectedSize(); r++) { int idx = filterContext.getSelected()[r]; @@ -1702,7 +1695,6 @@ private void nextVector(Decimal64ColumnVector result, } skipStreamRows(batchSize - previousIdx); } else if (!result.isRepeating || !result.isNull[0]) { - result.isRepeating = true; int previousIdx = 0; for (int r = 0; r != filterContext.getSelectedSize(); r++) { int idx = filterContext.getSelected()[r]; diff --git a/java/core/src/java/org/apache/orc/impl/ZlibCodec.java b/java/core/src/java/org/apache/orc/impl/ZlibCodec.java index 398ac0d16b..d4275a4c26 100644 --- a/java/core/src/java/org/apache/orc/impl/ZlibCodec.java +++ b/java/core/src/java/org/apache/orc/impl/ZlibCodec.java @@ -169,6 +169,17 @@ public void decompress(ByteBuffer in, ByteBuffer out) throws IOException { out.arrayOffset() + out.position(), out.remaining()); out.position(count + out.position()); + + if (!inflater.finished() && !inflater.needsDictionary() && !inflater.needsInput() && + count == 0) { + if (out.remaining() == 0) { + throw new IOException("Decompress output buffer too small. in = " + in + + ", out = " + out); + } else { + throw new IOException("Decompress error. in = " + in + + ", out = " + out); + } + } } catch (DataFormatException dfe) { throw new IOException("Bad compression data", dfe); } diff --git a/java/core/src/java/org/apache/orc/impl/mask/RedactMaskFactory.java b/java/core/src/java/org/apache/orc/impl/mask/RedactMaskFactory.java index c6b65c3e8f..1debb93497 100644 --- a/java/core/src/java/org/apache/orc/impl/mask/RedactMaskFactory.java +++ b/java/core/src/java/org/apache/orc/impl/mask/RedactMaskFactory.java @@ -17,7 +17,6 @@ */ package org.apache.orc.impl.mask; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; @@ -135,7 +134,7 @@ public RedactMaskFactory(String... params) { OTHER_NUMBER_REPLACEMENT = getNextCodepoint(param, DEFAULT_NUMBER_OTHER); OTHER_REPLACEMENT = getNextCodepoint(param, DEFAULT_OTHER); String[] timeParams; - if (params.length < 2 || StringUtils.isBlank(params[1])) { + if (params.length < 2 || params[1].isBlank()) { timeParams = null; } else { timeParams = params[1].split("\\W+"); @@ -154,7 +153,7 @@ public RedactMaskFactory(String... 
params) { (SECOND_REPLACEMENT != UNMASKED_DATE); /* un-mask range */ - if(!(params.length < 3 || StringUtils.isBlank(params[2]))) { + if(!(params.length < 3 || params[2].isBlank())) { String[] unmaskIndexes = params[2].split(","); for(int i=0; i < unmaskIndexes.length; i++ ) { diff --git a/java/core/src/test/org/apache/orc/TestColumnStatistics.java b/java/core/src/test/org/apache/orc/TestColumnStatistics.java index 2ef96e5f50..ddcbcdc1ac 100644 --- a/java/core/src/test/org/apache/orc/TestColumnStatistics.java +++ b/java/core/src/test/org/apache/orc/TestColumnStatistics.java @@ -27,6 +27,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; import org.apache.orc.impl.ColumnStatisticsImpl; import org.junit.jupiter.api.BeforeEach; @@ -44,6 +45,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; /** @@ -699,6 +701,47 @@ public void testDecimalMinMaxStatistics() throws Exception { "Incorrect minimum value"); } + @Test + public void testBinaryMerge() { + TypeDescription schema = TypeDescription.createBinary(); + + ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema); + ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema); + stats1.increment(3); + stats1.updateBinary(new BytesWritable("bob".getBytes(StandardCharsets.UTF_8))); + stats1.updateBinary(new BytesWritable("david".getBytes(StandardCharsets.UTF_8))); + stats1.updateBinary(new BytesWritable("charles".getBytes(StandardCharsets.UTF_8))); + stats2.increment(2); + stats2.updateBinary(new BytesWritable("anne".getBytes(StandardCharsets.UTF_8))); + stats2.updateBinary(new BytesWritable("abcdef".getBytes(StandardCharsets.UTF_8))); + + assertEquals(15, ((BinaryColumnStatistics) stats1).getSum()); + assertEquals(10, ((BinaryColumnStatistics) stats2).getSum()); + + stats1.merge(stats2); + + assertEquals(25, ((BinaryColumnStatistics) stats1).getSum()); + } + + @Test + public void testMergeIncompatible() { + TypeDescription stringSchema = TypeDescription.createString(); + ColumnStatisticsImpl stringStats = ColumnStatisticsImpl.create(stringSchema); + + TypeDescription doubleSchema = TypeDescription.createDouble(); + ColumnStatisticsImpl doubleStats = ColumnStatisticsImpl.create(doubleSchema); + + stringStats.increment(3); + stringStats.updateString(new Text("bob")); + stringStats.updateString(new Text("david")); + stringStats.updateString(new Text("charles")); + + assertThrows(IllegalArgumentException.class, () -> { + doubleStats.merge(stringStats); + }); + + assertEquals(0, ((DoubleColumnStatistics) doubleStats).getNumberOfValues()); + } Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); diff --git a/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java b/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java index a90a285a65..860b18aa7e 100644 --- a/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java +++ b/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java @@ -707,7 +707,7 @@ private void readDecimalInNullStripe(String 
typeString, Class expectedColumnT assertTrue(batch.cols[0].isRepeating); StringBuilder sb = new StringBuilder(); batch.cols[0].stringifyValue(sb, 1023); - assertEquals(sb.toString(), expectedResult[0]); + assertEquals(expectedResult[0], sb.toString()); rows.nextBatch(batch); assertEquals(1024, batch.size); @@ -717,17 +717,17 @@ private void readDecimalInNullStripe(String typeString, Class expectedColumnT assertFalse(batch.cols[0].isRepeating); StringBuilder sb2 = new StringBuilder(); batch.cols[0].stringifyValue(sb2, 1023); - assertEquals(sb2.toString(), expectedResult[1]); + assertEquals(expectedResult[1], sb2.toString()); rows.nextBatch(batch); assertEquals(1024, batch.size); assertEquals(expected, options.toString()); assertEquals(batch.cols.length, 1); assertEquals(batch.cols[0].getClass(), expectedColumnType); - assertTrue(batch.cols[0].isRepeating); + assertFalse(batch.cols[0].isRepeating); StringBuilder sb3 = new StringBuilder(); batch.cols[0].stringifyValue(sb3, 1023); - assertEquals(sb3.toString(), expectedResult[2]); + assertEquals(expectedResult[2], sb3.toString()); } private void testDecimalConvertToLongInNullStripe() throws Exception { diff --git a/java/core/src/test/org/apache/orc/impl/TestPredicatePushDownBounds.java b/java/core/src/test/org/apache/orc/impl/TestPredicatePushDownBounds.java index c2799ff901..aec865201c 100644 --- a/java/core/src/test/org/apache/orc/impl/TestPredicatePushDownBounds.java +++ b/java/core/src/test/org/apache/orc/impl/TestPredicatePushDownBounds.java @@ -17,7 +17,6 @@ */ package org.apache.orc.impl; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory; @@ -54,26 +53,20 @@ public void testCornerCases() { BloomFilter bf = new BloomFilter(100); // FFF... to PPP... for (int i = 70; i <= 80; i++) { - final String inputString = StringUtils - .repeat(Character.toString((char) i), stringLength); + final String inputString = Character.toString((char) i).repeat(stringLength); bf.addString(inputString); } - final String longStringF = StringUtils - .repeat(Character.toString('F'), stringLength); - final String longStringP = StringUtils - .repeat(Character.toString('P'), stringLength); + final String longStringF = Character.toString('F').repeat(stringLength); + final String longStringP = Character.toString('P').repeat(stringLength); /* String that matches the upperbound value after truncation */ - final String upperboundString = - StringUtils.repeat(Character.toString('P'), 1023) + "Q"; + final String upperboundString = Character.toString('P').repeat(1023) + "Q"; /* String that matches the lower value after truncation */ - final String lowerboundString = StringUtils - .repeat(Character.toString('F'), 1024); + final String lowerboundString = Character.toString('F').repeat(1024); - final String shortStringF = StringUtils.repeat(Character.toString('F'), 50); - final String shortStringP = - StringUtils.repeat(Character.toString('P'), 50) + "Q"; + final String shortStringF = Character.toString('F').repeat(50); + final String shortStringP = Character.toString('P').repeat(50) + "Q"; /* Test for a case EQUALS where only upperbound is set */ final PredicateLeaf predicateUpperBoundEquals = TestRecordReaderImpl @@ -165,17 +158,13 @@ public void testNormalCase() throws Exception { BloomFilter bf = new BloomFilter(100); // FFF... to PPP... 
for (int i = 70; i <= 80; i++) { - final String inputString = StringUtils - .repeat(Character.toString((char) i), bfStringLength); + final String inputString = Character.toString((char) i).repeat(bfStringLength); bf.addString(inputString); } - final String longStringF = StringUtils - .repeat(Character.toString('F'), stringLength); - final String longStringP = StringUtils - .repeat(Character.toString('P'), stringLength); - final String predicateString = StringUtils - .repeat(Character.toString('I'), 50); + final String longStringF = Character.toString('F').repeat(stringLength); + final String longStringP = Character.toString('P').repeat(stringLength); + final String predicateString = Character.toString('I').repeat(50); /* Test for a case where only upperbound is set */ @@ -215,26 +204,20 @@ public void testIN() throws Exception { final BloomFilter bf = new BloomFilter(100); // FFF... to PPP... for (int i = 70; i <= 80; i++) { - final String inputString = StringUtils - .repeat(Character.toString((char) i), stringLength); + final String inputString = Character.toString((char) i).repeat(stringLength); bf.addString(inputString); } - final String longStringF = StringUtils - .repeat(Character.toString('F'), stringLength); - final String longStringP = StringUtils - .repeat(Character.toString('P'), stringLength); + final String longStringF = Character.toString('F').repeat(stringLength); + final String longStringP = Character.toString('P').repeat(stringLength); /* String that matches the upperbound value after truncation */ - final String upperboundString = - StringUtils.repeat(Character.toString('P'), 1023) + "Q"; + final String upperboundString = Character.toString('P').repeat(1023) + "Q"; /* String that matches the lower value after truncation */ - final String lowerboundString = StringUtils - .repeat(Character.toString('F'), 1024); + final String lowerboundString = Character.toString('F').repeat(1024); - final String shortStringF = StringUtils.repeat(Character.toString('F'), 50); - final String shortStringP = - StringUtils.repeat(Character.toString('P'), 50) + "Q"; + final String shortStringF = Character.toString('F').repeat(50); + final String shortStringP = Character.toString('P').repeat(50) + "Q"; final List args = new ArrayList(); args.add(upperboundString); diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java index f0124715b8..378f0fcdad 100644 --- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java +++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java @@ -28,6 +28,7 @@ import org.apache.hadoop.hive.common.io.DiskRangeList; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; @@ -2732,4 +2733,71 @@ public void testHadoopVectoredIO() throws Exception { verify(spyFSDataInputStream, atLeastOnce()).readVectored(any(), any()); } + + @Test + public void testDecimalIsRepeatingFlag() throws IOException { + Configuration conf = new Configuration(); + FileSystem fs = FileSystem.get(conf); + Path testFilePath = new Path(workDir, "testDecimalIsRepeatingFlag.orc"); + fs.delete(testFilePath, true); + + Configuration decimalConf = new Configuration(conf); + 
decimalConf.set(OrcConf.STRIPE_ROW_COUNT.getAttribute(), "1024"); + decimalConf.set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "1"); + String typeStr = "decimal(20,10)"; + TypeDescription schema = TypeDescription.fromString("struct"); + Writer w = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(decimalConf).setSchema(schema)); + + VectorizedRowBatch b = schema.createRowBatch(); + DecimalColumnVector f1 = (DecimalColumnVector) b.cols[0]; + for (int i = 0; i < 1024; i++) { + f1.set(i, HiveDecimal.create("-119.4594594595")); + } + b.size = 1024; + w.addRowBatch(b); + + b.reset(); + for (int i = 0; i < 1024; i++) { + f1.set(i, HiveDecimal.create("9318.4351351351")); + } + b.size = 1024; + w.addRowBatch(b); + + b.reset(); + for (int i = 0; i < 1024; i++) { + f1.set(i, HiveDecimal.create("-4298.1513513514")); + } + b.size = 1024; + w.addRowBatch(b); + + b.reset(); + w.close(); + + Reader.Options options = new Reader.Options(); + try (Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf)); + RecordReader rows = reader.rows(options)) { + VectorizedRowBatch batch = schema.createRowBatch(); + + rows.nextBatch(batch); + assertEquals(1024, batch.size); + assertFalse(batch.cols[0].isRepeating); + for (HiveDecimalWritable hiveDecimalWritable : ((DecimalColumnVector) batch.cols[0]).vector) { + assertEquals(HiveDecimal.create("-119.4594594595"), hiveDecimalWritable.getHiveDecimal()); + } + + rows.nextBatch(batch); + assertEquals(1024, batch.size); + assertFalse(batch.cols[0].isRepeating); + for (HiveDecimalWritable hiveDecimalWritable : ((DecimalColumnVector) batch.cols[0]).vector) { + assertEquals(HiveDecimal.create("9318.4351351351"), hiveDecimalWritable.getHiveDecimal()); + } + + rows.nextBatch(batch); + assertEquals(1024, batch.size); + assertFalse(batch.cols[0].isRepeating); + for (HiveDecimalWritable hiveDecimalWritable : ((DecimalColumnVector) batch.cols[0]).vector) { + assertEquals(HiveDecimal.create("-4298.1513513514"), hiveDecimalWritable.getHiveDecimal()); + } + } + } } diff --git a/java/core/src/test/org/apache/orc/impl/TestZlib.java b/java/core/src/test/org/apache/orc/impl/TestZlib.java index 4ca62ca2af..6e940923ed 100644 --- a/java/core/src/test/org/apache/orc/impl/TestZlib.java +++ b/java/core/src/test/org/apache/orc/impl/TestZlib.java @@ -18,13 +18,21 @@ package org.apache.orc.impl; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.orc.CompressionCodec; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.RecordReader; import org.junit.jupiter.api.Test; import java.io.IOException; import java.nio.ByteBuffer; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; public class TestZlib { @@ -54,4 +62,24 @@ public void testCorrupt() throws Exception { // EXPECTED } } + + @Test + public void testCorruptZlibFile() { + Configuration conf = new Configuration(); + Path testFilePath = new Path(ClassLoader. 
+ getSystemResource("orc_corrupt_zlib.orc").getPath()); + + IOException exception = assertThrows( + IOException.class, + () -> { + try (Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf))) { + RecordReader rows = reader.rows(); + VectorizedRowBatch batch = reader.getSchema().createRowBatch(); + while (rows.nextBatch(batch)) { + } + } + } + ); + assertTrue(exception.getMessage().contains("Decompress output buffer too small")); + } } diff --git a/java/core/src/test/resources/orc_corrupt_zlib.orc b/java/core/src/test/resources/orc_corrupt_zlib.orc new file mode 100644 index 0000000000..e083a07c84 Binary files /dev/null and b/java/core/src/test/resources/orc_corrupt_zlib.orc differ diff --git a/java/examples/pom.xml b/java/examples/pom.xml index 119e00b0d4..5f7b8b69ac 100644 --- a/java/examples/pom.xml +++ b/java/examples/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc - 2.1.0-SNAPSHOT + 2.2.0-SNAPSHOT ../pom.xml diff --git a/java/mapreduce/pom.xml b/java/mapreduce/pom.xml index 30bd83e5ba..6f5b0c02ce 100644 --- a/java/mapreduce/pom.xml +++ b/java/mapreduce/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc - 2.1.0-SNAPSHOT + 2.2.0-SNAPSHOT ../pom.xml @@ -39,10 +39,6 @@ com.esotericsoftware kryo-shaded - - org.apache.commons - commons-lang3 - com.google.guava guava diff --git a/java/mapreduce/src/java/org/apache/orc/mapred/OrcInputFormat.java b/java/mapreduce/src/java/org/apache/orc/mapred/OrcInputFormat.java index ddb6a6ac2a..947d9b6f80 100644 --- a/java/mapreduce/src/java/org/apache/orc/mapred/OrcInputFormat.java +++ b/java/mapreduce/src/java/org/apache/orc/mapred/OrcInputFormat.java @@ -21,7 +21,6 @@ import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; @@ -68,7 +67,7 @@ public static boolean[] parseInclude(TypeDescription schema, boolean[] result = new boolean[schema.getMaximumId() + 1]; result[0] = true; - if (StringUtils.isBlank(columnsStr)) { + if (columnsStr.isBlank()) { return result; } diff --git a/java/pom.xml b/java/pom.xml index 578e69a53d..07d0f675b0 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ org.apache.orc orc - 2.1.0-SNAPSHOT + 2.2.0-SNAPSHOT pom Apache ORC @@ -60,31 +60,31 @@ - 1.78 - 1.16.0 - 10.15.0 + 1.80 + 1.18.0 + 10.23.0 ${project.basedir}/../../examples - 3.4.0 + 3.4.1 17 ${project.basedir}/../target/javadoc - 5.10.2 + 5.12.1 3.7.1 - 3.6.1 - 3.5.2 + 3.8.1 + 3.6.0 17 false - 3.9.6 + 3.9.9 5.10.0 - 1.0.0 + 1.1.0 - 2024-01-08T16:47:56Z - 3.25.3 - 2.0.12 + 2025-01-05T19:25:27Z + 3.25.5 + 2.0.17 2.8.1 - 3.0.0-M5 + 3.5.2 ${project.build.directory}/testing-tmp - 1.5.6-2 + 1.5.7-2 @@ -98,7 +98,7 @@ org.apache.orc orc-shims - 2.1.0-SNAPSHOT + 2.2.0-SNAPSHOT org.apache.hadoop @@ -113,17 +113,17 @@ org.apache.orc orc-core - 2.1.0-SNAPSHOT + 2.2.0-SNAPSHOT org.apache.orc orc-mapreduce - 2.1.0-SNAPSHOT + 2.2.0-SNAPSHOT org.apache.orc orc-tools - 2.1.0-SNAPSHOT + 2.2.0-SNAPSHOT @@ -135,7 +135,7 @@ com.google.code.gson gson - 2.9.0 + 2.13.0 com.google.protobuf @@ -145,28 +145,23 @@ commons-cli commons-cli - 1.6.0 + 1.9.0 org.apache.commons commons-lang3 - 3.14.0 + 3.17.0 io.airlift aircompressor - 0.26 + 2.0.2 com.github.luben zstd-jni ${zstd-jni.version} - - org.apache.commons - commons-csv - 1.10.0 - org.apache.hadoop hadoop-client-api @@ -208,7 +203,7 @@ org.threeten threeten-extra - 
1.7.1 + 1.8.0 com.aayushatharva.brotli4j @@ -221,7 +216,7 @@ com.google.guava guava - 33.1.0-jre + 33.4.0-jre test @@ -251,19 +246,19 @@ org.objenesis objenesis - 3.2 + 3.3 test net.bytebuddy byte-buddy - 1.14.11 + 1.17.0 test net.bytebuddy byte-buddy-agent - 1.14.11 + 1.17.0 test @@ -340,7 +335,7 @@ com.diffplug.spotless spotless-maven-plugin - 2.43.0 + 2.44.4 @@ -375,7 +370,7 @@ org.apache.maven.plugins maven-jar-plugin - 3.3.0 + 3.4.2 @@ -395,7 +390,7 @@ com.github.spotbugs spotbugs-maven-plugin - 4.8.3.0 + 4.9.3.0 spotbugs-include.xml spotbugs-exclude.xml @@ -436,6 +431,7 @@ .idea/** **/*.iml **/dependency-reduced-pom.xml + .mvn/jvm.config @@ -450,7 +446,7 @@ org.apache.maven.plugins maven-checkstyle-plugin - 3.3.1 + 3.6.0 ${basedir}/src/java @@ -495,7 +491,7 @@ org.codehaus.mojo build-helper-maven-plugin - 3.5.0 + 3.6.0 add-source @@ -600,7 +596,7 @@ io.github.zlika reproducible-build-maven-plugin - 0.16 + 0.17 @@ -608,12 +604,12 @@ org.apache.maven.plugins maven-enforcer-plugin - 3.4.0 + 3.5.0 org.codehaus.mojo extra-enforcer-rules - 1.8.0 + 1.10.0 diff --git a/java/shims/pom.xml b/java/shims/pom.xml index 5e9231b970..fb3a4e33c6 100644 --- a/java/shims/pom.xml +++ b/java/shims/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc - 2.1.0-SNAPSHOT + 2.2.0-SNAPSHOT ../pom.xml diff --git a/java/shims/src/java/org/apache/orc/impl/HadoopShims.java b/java/shims/src/java/org/apache/orc/impl/HadoopShims.java index 2ae0364f25..f79f353647 100644 --- a/java/shims/src/java/org/apache/orc/impl/HadoopShims.java +++ b/java/shims/src/java/org/apache/orc/impl/HadoopShims.java @@ -20,7 +20,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.util.VersionInfo; import org.apache.orc.EncryptionAlgorithm; import java.io.Closeable; @@ -132,9 +131,9 @@ ByteBuffer readBuffer(int maxLength, */ boolean endVariableLengthBlock(OutputStream output) throws IOException; - default boolean supportVectoredIO() { + default boolean supportVectoredIO(String version) { // HADOOP-18103 is available since Apache Hadoop 3.3.5+ - String[] versionParts = VersionInfo.getVersion().split("[.]"); + String[] versionParts = version.split("[.-]"); int major = Integer.parseInt(versionParts[0]); int minor = Integer.parseInt(versionParts[1]); int patch = Integer.parseInt(versionParts[2]); diff --git a/java/shims/src/test/org/apache/orc/impl/TestHadoopShimsPost3_3_4.java b/java/shims/src/test/org/apache/orc/impl/TestHadoopShimsPost3_3_4.java new file mode 100644 index 0000000000..774dac3c24 --- /dev/null +++ b/java/shims/src/test/org/apache/orc/impl/TestHadoopShimsPost3_3_4.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
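Beyond the version bumps above, the HadoopShims change makes supportVectoredIO testable by passing the Hadoop version string in as a parameter and splitting on "[.-]" instead of "[.]", so vendor-suffixed versions such as "3.3.6-co-3" no longer break Integer.parseInt on the patch component. The hunk shows the parsing but not the final comparison; the sketch below reconstructs a plausible check against the 3.3.5 threshold from HADOOP-18103, and that comparison is an assumption, not the patch's exact body:

    public class VectoredIoVersionCheckSketch {
      static boolean supportVectoredIO(String version) {
        // "3.3.6-co-3".split("[.-]") -> {"3", "3", "6", "co", "3"}; the old
        // split("[.]") would leave "6-co-3" and Integer.parseInt would throw.
        String[] versionParts = version.split("[.-]");
        int major = Integer.parseInt(versionParts[0]);
        int minor = Integer.parseInt(versionParts[1]);
        int patch = Integer.parseInt(versionParts[2]);
        // Assumed threshold: vectored IO needs Apache Hadoop 3.3.5 or newer.
        return major > 3 || (major == 3 && (minor > 3 || (minor == 3 && patch >= 5)));
      }

      public static void main(String[] args) {
        System.out.println(supportVectoredIO("3.3.4"));      // false
        System.out.println(supportVectoredIO("3.3.5"));      // true
        System.out.println(supportVectoredIO("3.3.6-co-3")); // true
      }
    }

These three cases match the assertions in TestHadoopShimsPost3_3_4 below.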
+ */ + +package org.apache.orc.impl; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +public class TestHadoopShimsPost3_3_4 { + + @Test + public void testOlderVersionForSupportVectoredIO() { + assertFalse(new HadoopShimsCurrent().supportVectoredIO("3.3.4")); + } + + @Test + public void testSupportedVersionForSupportVectoredIO() { + assertTrue(new HadoopShimsCurrent().supportVectoredIO("3.3.5")); + } + + @Test + public void testExtendedSemanticVersionForSupportVectoredIO() { + assertTrue(new HadoopShimsCurrent().supportVectoredIO("3.3.6-co-3")); + } +} diff --git a/java/tools/pom.xml b/java/tools/pom.xml index cc7cdd34f6..c8fee63de5 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc - 2.1.0-SNAPSHOT + 2.2.0-SNAPSHOT ../pom.xml @@ -48,7 +48,7 @@ com.opencsv opencsv - 5.9 + 5.10 commons-beanutils @@ -60,10 +60,6 @@ commons-cli commons-cli - - org.apache.commons - commons-lang3 - org.apache.hive hive-storage-api diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java b/java/tools/src/java/org/apache/orc/tools/FileDump.java index c235053106..b8acb1caed 100644 --- a/java/tools/src/java/org/apache/orc/tools/FileDump.java +++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java @@ -22,7 +22,6 @@ import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; @@ -61,7 +60,7 @@ */ public final class FileDump { public static final String UNKNOWN = "UNKNOWN"; - public static final String SEPARATOR = StringUtils.repeat("_", 120) + "\n"; + public static final String SEPARATOR = "_".repeat(120) + "\n"; public static final String RECOVER_READ_SIZE = "orc.recover.read.size"; // only for testing public static final int DEFAULT_BLOCK_SIZE = 256 * 1024 * 1024; public static final String DEFAULT_BACKUP_PATH = System.getProperty("java.io.tmpdir"); @@ -134,7 +133,9 @@ public static void main(Configuration conf, String[] args) throws Exception { boolean prettyPrint = cli.hasOption('p'); JsonFileDump.printJsonMetaData(filesInPath, conf, rowIndexCols, prettyPrint, printTimeZone); } else { - printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath); + boolean printColumnType = cli.hasOption("column-type"); + printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath, + printColumnType); } } } @@ -268,11 +269,11 @@ public static Collection getAllFilesInPath(final Path path, private static void printMetaData(List files, Configuration conf, List rowIndexCols, boolean printTimeZone, final boolean recover, - final String backupPath) + final String backupPath, final boolean printColumnType) throws IOException { List corruptFiles = new ArrayList<>(); for (String filename : files) { - printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles); + printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles, printColumnType); System.out.println(SEPARATOR); } @@ -294,6 +295,15 @@ private static void printMetaData(List files, Configuration conf, } } + static void printColumnsType(TypeDescription schema) { + int maximumId = schema.getMaximumId(); + for (int c = schema.getId(); c < maximumId + 1; ++c) { + TypeDescription type = schema.findSubtype(c); + System.out.println(" Column " + type.getId() + ": field: 
" + type.getFullFieldName() + + " type: " + type.toString()); + } + } + static void printTypeAnnotations(TypeDescription type, String prefix) { List attributes = type.getAttributeNames(); if (attributes.size() > 0) { @@ -329,7 +339,7 @@ static void printTypeAnnotations(TypeDescription type, String prefix) { private static void printMetaDataImpl(final String filename, final Configuration conf, List rowIndexCols, final boolean printTimeZone, - final List corruptFiles) throws IOException { + final List corruptFiles, final boolean printColumnType) throws IOException { Path file = new Path(filename); Reader reader = getReader(file, conf, corruptFiles); // if we can create reader then footer is not corrupt and file will readable @@ -351,15 +361,20 @@ private static void printMetaDataImpl(final String filename, ? "Proleptic Gregorian" : "Julian/Gregorian")); System.out.println("Type: " + reader.getSchema().toString()); + if (printColumnType) { + System.out.println("Columns type:"); + printColumnsType(reader.getSchema()); + } printTypeAnnotations(reader.getSchema(), "root"); System.out.println("\nStripe Statistics:"); List stripeStats = reader.getStripeStatistics(); for (int n = 0; n < stripeStats.size(); n++) { System.out.println(" Stripe " + (n + 1) + ":"); StripeStatistics ss = stripeStats.get(n); - for (int i = 0; i < ss.getColumnStatistics().length; ++i) { + ColumnStatistics[] columnStatistics = ss.getColumnStatistics(); + for (int i = 0; i < columnStatistics.length; ++i) { System.out.println(" Column " + i + ": " + - ss.getColumnStatistics()[i].toString()); + columnStatistics[i].toString()); } } ColumnStatistics[] stats = reader.getStatistics(); @@ -834,6 +849,11 @@ static Options createOptions() { .desc("specify a backup path to store the corrupted files (default: /tmp)") .hasArg() .build()); + + result.addOption(Option.builder() + .longOpt("column-type") + .desc("Print the column id, name and type of each column") + .build()); return result; } diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java index d6166ea91d..7d893a54c4 100644 --- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java +++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java @@ -49,6 +49,8 @@ import java.io.IOException; import java.io.StringWriter; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; @@ -110,10 +112,11 @@ public static void printJsonMetaData(List files, writer.name("stripeNumber").value(n + 1); StripeStatistics ss = stripeStatistics.get(n); writer.name("columnStatistics").beginArray(); - for (int i = 0; i < ss.getColumnStatistics().length; i++) { + ColumnStatistics[] columnStatistics = ss.getColumnStatistics(); + for (int i = 0; i < columnStatistics.length; i++) { writer.beginObject(); writer.name("columnId").value(i); - writeColumnStatistics(writer, ss.getColumnStatistics()[i]); + writeColumnStatistics(writer, columnStatistics[i]); writer.endObject(); } writer.endArray(); @@ -222,6 +225,17 @@ public static void printJsonMetaData(List files, writer.name("numDeletes").value(acidStats.deletes); writer.name("numUpdates").value(acidStats.updates); } + List keys = reader.getMetadataKeys(); + keys.remove(OrcAcidUtils.ACID_STATS); + if (!keys.isEmpty()) { + writer.name("userMetadata").beginObject(); + for (String key : keys) { + writer.name(key); + ByteBuffer byteBuffer = reader.getMetadataValue(key); + 
writer.value(String.valueOf(StandardCharsets.UTF_8.decode(byteBuffer))); + } + writer.endObject(); + } writer.name("status").value("OK"); rows.close(); diff --git a/java/tools/src/java/org/apache/orc/tools/PrintData.java b/java/tools/src/java/org/apache/orc/tools/PrintData.java index 37a7209421..6c7c18ba15 100644 --- a/java/tools/src/java/org/apache/orc/tools/PrintData.java +++ b/java/tools/src/java/org/apache/orc/tools/PrintData.java @@ -238,6 +238,7 @@ private static Options getOptions() { .build(); Option linesOpt = Option.builder("n").longOpt("lines") .argName("LINES") + .desc("Sets lines of data to be printed") .hasArg() .build(); diff --git a/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java b/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java index fbdb8696de..f7e9bb1054 100644 --- a/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java +++ b/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java @@ -199,7 +199,7 @@ public ConvertTool(Configuration conf, this.csvHeaderLines = getIntOption(opts, 'H', 0); this.csvNullString = opts.getOptionValue('n', ""); this.timestampFormat = opts.getOptionValue("t", DEFAULT_TIMESTAMP_FORMAT); - this.bloomFilterColumns = opts.getOptionValue('b', null); + this.bloomFilterColumns = opts.getOptionValue('b'); this.unionTag = opts.getOptionValue("union-tag", "tag"); this.unionValue = opts.getOptionValue("union-value", "value"); String outFilename = opts.hasOption('o') diff --git a/java/tools/src/java/org/apache/orc/tools/json/JsonSchemaFinder.java b/java/tools/src/java/org/apache/orc/tools/json/JsonSchemaFinder.java index 358eb21a5d..7a07650493 100644 --- a/java/tools/src/java/org/apache/orc/tools/json/JsonSchemaFinder.java +++ b/java/tools/src/java/org/apache/orc/tools/json/JsonSchemaFinder.java @@ -29,7 +29,6 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; -import org.apache.commons.lang3.exception.ExceptionUtils; import org.apache.hadoop.conf.Configuration; import org.apache.orc.TypeDescription; import org.apache.orc.TypeDescriptionPrettyPrint; @@ -40,6 +39,8 @@ import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintStream; +import java.io.PrintWriter; +import java.io.StringWriter; import java.math.BigInteger; import java.nio.charset.StandardCharsets; import java.util.List; @@ -264,6 +265,15 @@ public void addFile(java.io.Reader reader, String filename) { } } + public static String getStackTrace(final Throwable throwable) { + if (throwable == null) { + return ""; + } + final StringWriter sw = new StringWriter(); + throwable.printStackTrace(new PrintWriter(sw, true)); + return sw.toString(); + } + private void printParseExceptionMsg(JsonParseException e, String filename) { System.err.printf( "A JsonParseException was thrown while processing the %dth record of file %s.%n", @@ -282,7 +292,7 @@ private void printParseExceptionMsg(JsonParseException e, String filename) { System.exit(1); } } - System.err.printf("Please check the file.%n%n%s%n", ExceptionUtils.getStackTrace(e)); + System.err.printf("Please check the file.%n%n%s%n", getStackTrace(e)); System.exit(1); } diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java index c265a7400e..2699abf402 100644 --- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java +++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java @@ -22,6 +22,7 @@ import 
org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; @@ -58,6 +59,7 @@ import java.nio.file.Paths; import java.sql.Timestamp; import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -827,6 +829,74 @@ public void testDoubleNaNAndInfinite() throws Exception { assertEquals("{\"x\":12.34}", lines[2]); } + @Test + public void testDumpColumnType() throws Exception { + TypeDescription schema = + TypeDescription.fromString("struct<a:boolean,b:tinyint,c:smallint,d:int,e:bigint,f:float,g:double,h:string,i:date,j:timestamp,k:binary,l:decimal(20,5),m:varchar(5),n:char(5)>"); + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf) + .fileSystem(fs) + .setSchema(schema)); + + VectorizedRowBatch batch = schema.createRowBatch(); + LongColumnVector a = (LongColumnVector) batch.cols[0]; + LongColumnVector b = (LongColumnVector) batch.cols[1]; + LongColumnVector c = (LongColumnVector) batch.cols[2]; + LongColumnVector d = (LongColumnVector) batch.cols[3]; + LongColumnVector e = (LongColumnVector) batch.cols[4]; + DoubleColumnVector f = (DoubleColumnVector) batch.cols[5]; + DoubleColumnVector g = (DoubleColumnVector) batch.cols[6]; + BytesColumnVector h = (BytesColumnVector) batch.cols[7]; + DateColumnVector i = (DateColumnVector) batch.cols[8]; + TimestampColumnVector j = (TimestampColumnVector) batch.cols[9]; + BytesColumnVector k = (BytesColumnVector) batch.cols[10]; + DecimalColumnVector l = (DecimalColumnVector) batch.cols[11]; + BytesColumnVector m = (BytesColumnVector) batch.cols[12]; + BytesColumnVector n = (BytesColumnVector) batch.cols[13]; + + for (int o = 0; o < VectorizedRowBatch.DEFAULT_SIZE * 2; o++) { + int row = batch.size++; + a.vector[row] = row % 2; + b.vector[row] = row % 128; + c.vector[row] = row; + d.vector[row] = row; + e.vector[row] = row * 10000000L; + f.vector[row] = row * 1.0f; + g.vector[row] = row * 1.0d; + byte[] bytes = String.valueOf(row).getBytes(StandardCharsets.UTF_8); + h.setRef(row, bytes, 0, bytes.length); + i.vector[row] = row; + j.time[row] = row * 1000L; + j.nanos[row] = row; + k.setRef(row, bytes, 0, bytes.length); + l.vector[row] = new HiveDecimalWritable(row); + m.setRef(row, bytes, 0, bytes.length); + bytes = String.valueOf(10000 - row).getBytes(StandardCharsets.UTF_8); + n.setRef(row, bytes, 0, bytes.length); + + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + } + writer.close(); + assertEquals(VectorizedRowBatch.DEFAULT_SIZE * 2, writer.getNumberOfRows()); + + PrintStream origOut = System.out; + String outputFilename = "orc-file-dump-column-type.out"; + FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename); + + // replace stdout and run command + System.setOut(new PrintStream(myOut, false, StandardCharsets.UTF_8.toString())); + FileDump.main(new String[]{testFilePath.toString(), "--column-type"}); + System.out.flush(); + System.setOut(origOut); + + checkOutput(outputFilename, workDir + File.separator + outputFilename); + } + private static boolean contentEquals(String filePath, String otherFilePath) throws IOException { try (InputStream is = new BufferedInputStream(new FileInputStream(filePath)); InputStream otherIs = new BufferedInputStream(new FileInputStream(otherFilePath))) {
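The --column-type flag exercised by this test maps onto the printColumnsType helper added to FileDump above; the traversal is a flat walk over the schema's column ids. A self-contained sketch of the same walk on a trimmed version of this test's schema (illustrative, mirroring the helper rather than reproducing it; assumes orc-core on the classpath):

    import org.apache.orc.TypeDescription;

    public class ColumnTypeWalkSketch {
      public static void main(String[] args) {
        TypeDescription schema = TypeDescription.fromString(
            "struct<a:boolean,b:tinyint,h:string,l:decimal(20,5)>");
        // Column ids come from a pre-order walk: 0 is the root struct and
        // 1..n are its fields, matching the "Columns type:" block in the
        // expected output below.
        for (int c = schema.getId(); c <= schema.getMaximumId(); ++c) {
          TypeDescription type = schema.findSubtype(c);
          System.out.println("  Column " + type.getId() + ": field: "
              + type.getFullFieldName() + " type: " + type);
        }
      }
    }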
diff --git a/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java b/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java index 225d7c34d0..0ffbea7033 100644 --- a/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java +++ b/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java @@ -117,6 +117,10 @@ public void testJsonDump() throws Exception { writer.addRowBatch(batch); } + writer.addUserMetadata("hive.acid.key.index", + StandardCharsets.UTF_8.encode("1,1,1;2,3,5;")); + writer.addUserMetadata("some.user.property", + StandardCharsets.UTF_8.encode("foo#bar$baz&")); writer.close(); PrintStream origOut = System.out; String outputFilename = "orc-file-dump.json"; diff --git a/java/tools/src/test/org/apache/orc/tools/TesScanData.java b/java/tools/src/test/org/apache/orc/tools/TestScanData.java similarity index 94% rename from java/tools/src/test/org/apache/orc/tools/TesScanData.java rename to java/tools/src/test/org/apache/orc/tools/TestScanData.java index df73abc900..e9042a4037 100644 --- a/java/tools/src/test/org/apache/orc/tools/TesScanData.java +++ b/java/tools/src/test/org/apache/orc/tools/TestScanData.java @@ -36,7 +36,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue; -public class TesScanData { +public class TestScanData { private Path workDir = new Path(System.getProperty("test.tmp.dir")); private Configuration conf; private FileSystem fs; @@ -47,7 +47,7 @@ public void openFileSystem() throws Exception { conf = new Configuration(); fs = FileSystem.getLocal(conf); fs.setWorkingDirectory(workDir); - testFilePath = new Path("TesScanData.testScan.orc"); + testFilePath = new Path("TestScanData.testScan.orc"); fs.delete(testFilePath, false); } @@ -86,6 +86,6 @@ public void testScan() throws Exception { assertTrue(output.contains("{\"category\": \"struct\", \"id\": 0, \"max\": 2, \"fields\": [\n" + "{ \"x\": {\"category\": \"int\", \"id\": 1, \"max\": 1}},\n" + "{ \"y\": {\"category\": \"string\", \"id\": 2, \"max\": 2}}]}")); - assertTrue(output.contains("File: TesScanData.testScan.orc, bad batches: 0, rows: 10000/10000")); + assertTrue(output.contains("File: TestScanData.testScan.orc, bad batches: 0, rows: 10000/10000")); } } diff --git a/java/tools/src/test/resources/orc-file-dump-column-type.out b/java/tools/src/test/resources/orc-file-dump-column-type.out new file mode 100644 index 0000000000..73267e7287 --- /dev/null +++ b/java/tools/src/test/resources/orc-file-dump-column-type.out @@ -0,0 +1,121 @@ +Structure for TestFileDump.testDump.orc +File Version: 0.12 with ORC_14 by ORC Java 2.1.0-SNAPSHOT +Rows: 2048 +Compression: ZSTD +Compression size: 262144 +Calendar: Julian/Gregorian +Type: struct<a:boolean,b:tinyint,c:smallint,d:int,e:bigint,f:float,g:double,h:string,i:date,j:timestamp,k:binary,l:decimal(20,5),m:varchar(5),n:char(5)> +Columns type: + Column 0: field: 0 type: struct<a:boolean,b:tinyint,c:smallint,d:int,e:bigint,f:float,g:double,h:string,i:date,j:timestamp,k:binary,l:decimal(20,5),m:varchar(5),n:char(5)> + Column 1: field: a type: boolean + Column 2: field: b type: tinyint + Column 3: field: c type: smallint + Column 4: field: d type: int + Column 5: field: e type: bigint + Column 6: field: f type: float + Column 7: field: g type: double + Column 8: field: h type: string + Column 9: field: i type: date + Column 10: field: j type: timestamp + Column 11: field: k type: binary + Column 12: field: l type: decimal(20,5) + Column 13: field: m type: varchar(5) + Column 14: field: n type: char(5) + +Stripe Statistics: + Stripe 1: + Column 0: count: 2048 hasNull: false + Column 1: count: 2048 hasNull: false bytesOnDisk: 7 true: 1024 + Column 2: count: 2048 hasNull: false bytesOnDisk: 152 min: 0 max: 127 sum: 130048 + Column 3: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552 + Column 4:
count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552 + Column 5: count: 2048 hasNull: false bytesOnDisk: 35 min: 0 max: 10230000000 sum: 10475520000000 + Column 6: count: 2048 hasNull: false bytesOnDisk: 2361 min: 0.0 max: 1023.0 sum: 1047552.0 + Column 7: count: 2048 hasNull: false bytesOnDisk: 973 min: 0.0 max: 1023.0 sum: 1047552.0 + Column 8: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972 + Column 9: count: 2048 hasNull: false bytesOnDisk: 21 min: Hybrid AD 1970-01-01 max: Hybrid AD 1972-10-20 + Column 10: count: 2048 hasNull: false bytesOnDisk: 1626 min: 1969-12-31 16:00:00.0 max: 1969-12-31 16:17:03.000001023 + Column 11: count: 2048 hasNull: false bytesOnDisk: 1404 sum: 5972 + Column 12: count: 2048 hasNull: false bytesOnDisk: 1666 min: 0 max: 1023 sum: 1047552 + Column 13: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972 + Column 14: count: 2048 hasNull: false bytesOnDisk: 1277 min: 10000 max: 9999 sum: 10240 + +File Statistics: + Column 0: count: 2048 hasNull: false + Column 1: count: 2048 hasNull: false bytesOnDisk: 7 true: 1024 + Column 2: count: 2048 hasNull: false bytesOnDisk: 152 min: 0 max: 127 sum: 130048 + Column 3: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552 + Column 4: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552 + Column 5: count: 2048 hasNull: false bytesOnDisk: 35 min: 0 max: 10230000000 sum: 10475520000000 + Column 6: count: 2048 hasNull: false bytesOnDisk: 2361 min: 0.0 max: 1023.0 sum: 1047552.0 + Column 7: count: 2048 hasNull: false bytesOnDisk: 973 min: 0.0 max: 1023.0 sum: 1047552.0 + Column 8: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972 + Column 9: count: 2048 hasNull: false bytesOnDisk: 21 min: Hybrid AD 1970-01-01 max: Hybrid AD 1972-10-20 + Column 10: count: 2048 hasNull: false bytesOnDisk: 1626 min: 1969-12-31 16:00:00.0 max: 1969-12-31 16:17:03.000001023 + Column 11: count: 2048 hasNull: false bytesOnDisk: 1404 sum: 5972 + Column 12: count: 2048 hasNull: false bytesOnDisk: 1666 min: 0 max: 1023 sum: 1047552 + Column 13: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972 + Column 14: count: 2048 hasNull: false bytesOnDisk: 1277 min: 10000 max: 9999 sum: 10240 + +Stripes: + Stripe: offset: 3 data: 15540 rows: 2048 tail: 225 index: 464 + Stream: column 0 section ROW_INDEX start: 3 length 12 + Stream: column 1 section ROW_INDEX start: 15 length 24 + Stream: column 2 section ROW_INDEX start: 39 length 28 + Stream: column 3 section ROW_INDEX start: 67 length 28 + Stream: column 4 section ROW_INDEX start: 95 length 28 + Stream: column 5 section ROW_INDEX start: 123 length 35 + Stream: column 6 section ROW_INDEX start: 158 length 45 + Stream: column 7 section ROW_INDEX start: 203 length 45 + Stream: column 8 section ROW_INDEX start: 248 length 30 + Stream: column 9 section ROW_INDEX start: 278 length 24 + Stream: column 10 section ROW_INDEX start: 302 length 35 + Stream: column 11 section ROW_INDEX start: 337 length 24 + Stream: column 12 section ROW_INDEX start: 361 length 39 + Stream: column 13 section ROW_INDEX start: 400 length 30 + Stream: column 14 section ROW_INDEX start: 430 length 37 + Stream: column 1 section DATA start: 467 length 7 + Stream: column 2 section DATA start: 474 length 152 + Stream: column 3 section DATA start: 626 length 21 + Stream: column 4 section DATA start: 647 length 21 + Stream: column 5 section DATA start: 668 length 35 + Stream: column 6 section DATA start: 703 
length 2361 + Stream: column 7 section DATA start: 3064 length 973 + Stream: column 8 section DATA start: 4037 length 1575 + Stream: column 8 section LENGTH start: 5612 length 47 + Stream: column 8 section DICTIONARY_DATA start: 5659 length 1366 + Stream: column 9 section DATA start: 7025 length 21 + Stream: column 10 section DATA start: 7046 length 35 + Stream: column 10 section SECONDARY start: 7081 length 1591 + Stream: column 11 section DATA start: 8672 length 1368 + Stream: column 11 section LENGTH start: 10040 length 36 + Stream: column 12 section DATA start: 10076 length 1647 + Stream: column 12 section SECONDARY start: 11723 length 19 + Stream: column 13 section DATA start: 11742 length 1575 + Stream: column 13 section LENGTH start: 13317 length 47 + Stream: column 13 section DICTIONARY_DATA start: 13364 length 1366 + Stream: column 14 section DATA start: 14730 length 753 + Stream: column 14 section LENGTH start: 15483 length 11 + Stream: column 14 section DICTIONARY_DATA start: 15494 length 513 + Encoding column 0: DIRECT + Encoding column 1: DIRECT + Encoding column 2: DIRECT + Encoding column 3: DIRECT_V2 + Encoding column 4: DIRECT_V2 + Encoding column 5: DIRECT_V2 + Encoding column 6: DIRECT + Encoding column 7: DIRECT + Encoding column 8: DICTIONARY_V2[1024] + Encoding column 9: DIRECT_V2 + Encoding column 10: DIRECT_V2 + Encoding column 11: DIRECT_V2 + Encoding column 12: DIRECT_V2 + Encoding column 13: DICTIONARY_V2[1024] + Encoding column 14: DICTIONARY_V2[1024] + +File length: 16919 bytes +File raw data size: 1048404 bytes +Padding length: 0 bytes +Padding ratio: 0% +________________________________________________________________________________________________________________________ + diff --git a/java/tools/src/test/resources/orc-file-dump.json b/java/tools/src/test/resources/orc-file-dump.json index d94c59bb6a..15fdba74a8 100644 --- a/java/tools/src/test/resources/orc-file-dump.json +++ b/java/tools/src/test/resources/orc-file-dump.json @@ -2,7 +2,7 @@ "fileName": "TestFileDump.testDump.orc", "fileVersion": "0.12", "writerVersion": "ORC_14", - "softwareVersion": "ORC Java 1.9.0-SNAPSHOT", + "softwareVersion": "ORC Java 2.1.0-SNAPSHOT", "numberOfRows": 21000, "compression": "ZLIB", "compressionBufferSize": 4096, @@ -461,48 +461,48 @@ "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 1, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 2, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 3, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 4, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } ], "stripeLevelBloomFilter": { "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + 
"expectedFpp": 5.7562566E-12 } } ] @@ -704,48 +704,48 @@ "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 1, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 2, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 3, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 4, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } ], "stripeLevelBloomFilter": { "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } } ] @@ -947,48 +947,48 @@ "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 1, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 2, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 3, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 4, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } ], "stripeLevelBloomFilter": { "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } } ] @@ -1190,48 +1190,48 @@ "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 1, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 2, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 3, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 4, "numHashFunctions": 7, 
"bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } ], "stripeLevelBloomFilter": { "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } } ] @@ -1361,16 +1361,16 @@ "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } ], "stripeLevelBloomFilter": { "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } } ] @@ -1380,5 +1380,9 @@ "rawDataSize": 2144730, "paddingLength": 0, "paddingRatio": 0.0, + "userMetadata": { + "hive.acid.key.index": "1,1,1;2,3,5;", + "some.user.property": "foo#bar$baz&" + }, "status": "OK" } diff --git a/site/Dockerfile b/site/Dockerfile index ff0a613974..a2a26a285f 100644 --- a/site/Dockerfile +++ b/site/Dockerfile @@ -17,8 +17,11 @@ # ORC site builder # -FROM ubuntu:20.04 -LABEL maintainer="Apache ORC project " +FROM ubuntu:24.04 +LABEL org.opencontainers.image.authors="Apache ORC project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache ORC site builder" +LABEL org.opencontainers.image.version="" RUN ln -fs /usr/share/zoneinfo/America/Los_Angeles /etc/localtime RUN apt-get update @@ -37,8 +40,8 @@ RUN gem install \ liquid \ listen \ rouge -RUN gem install jekyll -v 3.8.6 -RUN gem install github-pages +RUN gem install jekyll +RUN gem install -f github-pages RUN useradd -ms /bin/bash orc COPY . 
/home/orc/site diff --git a/site/Gemfile b/site/Gemfile index 1c529c9ce1..200c6ce7b2 100644 --- a/site/Gemfile +++ b/site/Gemfile @@ -1,3 +1,2 @@ source 'https://rubygems.org/' -gem 'rouge' -gem 'jekyll', "~> 3.8.3" +gem 'jekyll', "~> 4.3" diff --git a/site/_data/releases.yml b/site/_data/releases.yml index e4f0fc3600..c181ef17c3 100644 --- a/site/_data/releases.yml +++ b/site/_data/releases.yml @@ -1,16 +1,80 @@ # List the releases in reverse logical order # Only one release should be tagged latest +2.1.1: + date: 2025-03-06 + state: latest + tar: orc-2.1.1.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: 15af8baeee322bab0298559a14a09cf8c14cf2008e35d8a78d3cc8a4c98d1e59 + known-issues: + +2.1.0: + date: 2025-01-09 + state: archived + tar: orc-2.1.0.tar.gz + signed-by: William Hyun (DECDFA29) + sha256: 1ffac0228aa83f04a1b1cf2788a3af5953e82587ae3a77c41900e99f2557132d + known-issues: + +2.0.4: + date: 2025-03-20 + state: stable + tar: orc-2.0.4.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: 9525a76fae64a6da2a29adba36474c2ef863d042a394b78a9873d591649b7f0a + known-issues: + +2.0.3: + date: 2024-11-14 + state: archived + tar: orc-2.0.3.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: 082cba862b5a8a0d14c225404d0b51cd8d1b64ca81b8f1e500322ce8922cb86d + known-issues: + +2.0.2: + date: 2024-08-15 + state: archived + tar: orc-2.0.2.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: fabdee3e8acd64dae1e8b8987149a7188121b40b025de46d15cc9d0becee2279 + known-issues: + +2.0.1: + date: 2024-05-14 + state: archived + tar: orc-2.0.1.tar.gz + signed-by: William Hyun (DECDFA29) + sha256: 1ffac0228aa83f04a1b1cf2788a3af5953e82587ae3a77c41900e99f2557132d + known-issues: + 2.0.0: date: 2024-03-08 - state: latest + state: archived tar: orc-2.0.0.tar.gz signed-by: Dongjoon Hyun (34F0FC5C) sha256: 9107730919c29eb39efaff1b9e36166634d1d4d9477e5fee76bfd6a8fec317df known-issues: +1.9.5: + date: 2024-11-14 + state: stable + tar: orc-1.9.5.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: 6900b4e8a2e4e49275f4067bd0f838ad68330204305fd3f13a5ec519e9d71547 + known-issues: + +1.9.4: + date: 2024-07-16 + state: archived + tar: orc-1.9.4.tar.gz + signed-by: William Hyun (DECDFA29) + sha256: d9a6bcc00e07a6e54d81ce380134e495ed0fc0d9dc1988d4d52125c9def097fd + known-issues: + 1.9.3: date: 2024-03-20 - state: stable + state: archived tar: orc-1.9.3.tar.gz signed-by: Gang Wu (578F619B) sha256: f737d005d0c4deb65688ac3c0223ed530b0ba6258552555b2774dcdb77359b0f @@ -40,9 +104,25 @@ sha256: 0dca8bbccdb2ee87e59ba964933436beebd02ea78c4134424828a8127fbc4faa known-issues: +1.8.8: + date: 2024-11-11 + state: stable + tar: orc-1.8.8.tar.gz + signed-by: Gang Wu (8A461DF4) + sha256: eca12a9139c0889d11ef1ecc8f273ccb0ef5d19df70d61cb732194d806db026b + known-issues: + +1.8.7: + date: 2024-04-14 + state: archived + tar: orc-1.8.7.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: 57c9d12bf74b2752b1ce1039c15035c3b6f6531d865df962a99b3e079b3dfdb7 + known-issues: + 1.8.6: date: 2023-11-10 - state: stable + state: archived tar: orc-1.8.6.tar.gz signed-by: Dongjoon Hyun (34F0FC5C) sha256: 5675b18118df4dd7f86cc6ba859ed75b425ea1b7ddff805e1d671a17fd57d7f7 @@ -96,9 +176,17 @@ sha256: 859d78bfded98405c32ccb2847b565a57bcc53f473a74087c1f750aeb5932e62 known-issues: +1.7.11: + date: 2024-09-13 + state: archived + tar: orc-1.7.11.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: ff62f0b882470529b3e2507daa4092ffdb34818c220abefb11cac443e5757236 + known-issues: + 1.7.10: date: 2023-11-10 - state: stable + state: archived tar: 
orc-1.7.10.tar.gz signed-by: Dongjoon Hyun (34F0FC5C) sha256: 85aef9368dc9bcdffaaf10010b66dfe053ce22f30b64854f63852248164686a3 diff --git a/site/_docs/building.md b/site/_docs/building.md index f1cc015eaa..4a57663ef9 100644 --- a/site/_docs/building.md +++ b/site/_docs/building.md @@ -11,8 +11,8 @@ The C++ library is supported on the following operating systems: * CentOS 7 * Debian 10 to 12 -* MacOS 12 to 14 -* Ubuntu 20.04 to 24.04 +* MacOS 13 to 15 +* Ubuntu 22.04 to 24.04 You'll want to install the usual set of developer tools, but at least: @@ -27,7 +27,6 @@ is in the docker subdirectory, for the list of packages required to build ORC: * [Debian 11]({{ page.dockerUrl }}/debian11/Dockerfile) * [Debian 12]({{ page.dockerUrl }}/debian12/Dockerfile) -* [Ubuntu 20]({{ page.dockerUrl }}/ubuntu20/Dockerfile) * [Ubuntu 22]({{ page.dockerUrl }}/ubuntu22/Dockerfile) * [Ubuntu 24]({{ page.dockerUrl }}/ubuntu24/Dockerfile) * [Fedora 37]({{ page.dockerUrl }}/fedora37/Dockerfile) diff --git a/site/_docs/dask.md b/site/_docs/dask.md index 7719e7d4cd..3699d59fd6 100644 --- a/site/_docs/dask.md +++ b/site/_docs/dask.md @@ -9,7 +9,7 @@ permalink: /docs/dask.html [Dask](https://dask.org) also supports Apache ORC. ``` -pip3 install "dask[dataframe]==2023.8.1" +pip3 install "dask[dataframe]==2024.12.1" pip3 install pandas ``` diff --git a/site/_docs/index.md b/site/_docs/index.md index 76addd410f..5d3e2ec2a9 100644 --- a/site/_docs/index.md +++ b/site/_docs/index.md @@ -37,4 +37,4 @@ are separated from each other so the reader can read just the columns that are required. For details on the specifics of the ORC format, please see the [ORC -format specification](/specification/). \ No newline at end of file +format specification]({{ site.baseurl }}/specification/). diff --git a/site/_docs/java-tools.md b/site/_docs/java-tools.md index f537201133..a3d546e007 100644 --- a/site/_docs/java-tools.md +++ b/site/_docs/java-tools.md @@ -142,6 +142,9 @@ equivalent to the Hive ORC File Dump command. `--backup-path ` : when used with --recover specifies the path where the recovered file is written (default: /tmp) +`--column-type` + : Print the column id, name and type of each column + `-d,--data` : Should the data be printed diff --git a/site/_docs/pyarrow.md b/site/_docs/pyarrow.md index fca23797fe..aa32c54a9d 100644 --- a/site/_docs/pyarrow.md +++ b/site/_docs/pyarrow.md @@ -9,7 +9,7 @@ permalink: /docs/pyarrow.html [Apache Arrow](https://arrow.apache.org) project's [PyArrow](https://pypi.org/project/pyarrow/) is the recommended package. ``` -pip3 install pyarrow==13.0.0 +pip3 install pyarrow==18.1.0 pip3 install pandas ``` diff --git a/site/_includes/docs_ul.html b/site/_includes/docs_ul.html index 8e93fee854..a11fdbadb8 100644 --- a/site/_includes/docs_ul.html +++ b/site/_includes/docs_ul.html @@ -12,7 +12,7 @@ {% for p in site.docs %} {% if p.url == item_url %} -
[docs_ul.html hunk: the removed and added lines both render "{{ p.title }}" as the current page's entry; the wrapping list-item markup was lost in extraction] {% break %} {% endif %} {% endfor %} diff --git a/site/_includes/header.html b/site/_includes/header.html index e6e4721cf9..04d5ebde21 100644 --- a/site/_includes/header.html +++ b/site/_includes/header.html @@ -5,9 +5,9 @@ [header.html hunk body was navigation markup, lost in extraction]
    diff --git a/site/_includes/news_contents.html b/site/_includes/news_contents.html index 2748456741..85546b49c5 100644 --- a/site/_includes/news_contents.html +++ b/site/_includes/news_contents.html @@ -2,17 +2,17 @@