diff --git a/.asf.yaml b/.asf.yaml
index 14178a61c8..0b09389458 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# https://cwiki.apache.org/confluence/display/INFRA/git+-+.asf.yaml+features
+# https://github.com/apache/infrastructure-asfyaml/blob/main/README.md
 ---
 github:
   description: "Apache ORC - the smallest, fastest columnar storage for Hadoop workloads"
@@ -24,12 +24,17 @@ github:
     merge: false
     squash: true
     rebase: true
+  ghp_branch: main
+  ghp_path: /site
   labels:
     - apache
     - orc
     - java
     - cpp
     - big-data
+  protected_tags:
+    - "rel/*"
+    - "v*.*.*"
 notifications:
   pullrequests: issues@orc.apache.org
   issues: issues@orc.apache.org
diff --git a/.clang-tidy b/.clang-tidy
index bd995bca54..b401f8948b 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -21,13 +21,14 @@ Checks: "-*,
 CheckOptions:
   [
+    { key: readability-identifier-naming.ParameterCase, value: "camelBack" },
+    { key: readability-identifier-naming.PrivateMemberCase, value: "camelBack"},
     { key: readability-identifier-naming.PrivateMemberSuffix, value: "_" },
     { key: readability-identifier-naming.ProtectedMemberSuffix, value: "" },
     { key: readability-identifier-naming.PublicMemberSuffix, value: "" },
-    { key: readability-identifier-naming.ParameterCase, value: "camelBack" },
     { key: readability-identifier-naming.ParameterIgnoredRegexp, value: "^[a-zA-Z]$" },
   ]
 
 WarningsAsErrors: ''
-HeaderFilterRegex: '.*'
+HeaderFilterRegex: '(orc/c\+\+/|orc/tools)'
 FormatStyle: none
\ No newline at end of file
diff --git a/.github/.licenserc.yaml b/.github/.licenserc.yaml
index a66db6601f..a16671e9d6 100644
--- a/.github/.licenserc.yaml
+++ b/.github/.licenserc.yaml
@@ -22,5 +22,6 @@ header:
     - 'NOTICE'
     - '.clang-format'
     - '.asf.yaml'
+    - '.nojekyll'
 
   comment: on-failure
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 8eddbcdea3..05a385618d 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -20,12 +20,9 @@ updates:
     schedule:
       interval: "weekly"
     ignore:
-      # Pin gson to 2.2.4 because of Hive
+      # Pin gson to 2.9.0 because of Hive
      - dependency-name: "com.google.code.gson:gson"
-        versions: "[2.3,)"
+        versions: "[2.9.1,)"
       # Pin jodd-core to 3.5.2
       - dependency-name: "org.jodd:jodd-core"
         versions: "[3.5.3,)"
-      # Pin annotations to 17.0.0
-      - dependency-name: "org.jetbrains.annotations"
-        versions: "[17.0.1,)"
diff --git a/.github/lsan-suppressions.txt b/.github/lsan-suppressions.txt
new file mode 100644
index 0000000000..fc26ee8754
--- /dev/null
+++ b/.github/lsan-suppressions.txt
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# +# Add specific leak suppressions here if needed +# Format: +# leak:SymbolName +# leak:source_file.cc diff --git a/.github/workflows/asan_test.yml b/.github/workflows/asan_test.yml new file mode 100644 index 0000000000..6e7ac64fbb --- /dev/null +++ b/.github/workflows/asan_test.yml @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Address/Undefined Sanitizer Tests + +on: + pull_request: + paths-ignore: + - 'site/**' + - 'conan/**' + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + asan-test: + name: "ASAN with ${{ matrix.compiler }} on Ubuntu" + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + compiler: [gcc, clang] + include: + - compiler: gcc + cc: gcc + cxx: g++ + - compiler: clang + cc: clang + cxx: clang++ + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y build-essential cmake libpthread-stubs0-dev + - name: Configure and Build with ASAN and UBSAN + env: + CC: ${{ matrix.cc }} + CXX: ${{ matrix.cxx }} + run: | + mkdir -p build && cd build + cmake .. -DCMAKE_BUILD_TYPE=Debug -DENABLE_ASAN=ON -DENABLE_UBSAN=ON -DBUILD_ENABLE_AVX512=ON -DBUILD_CPP_ENABLE_METRICS=ON -DBUILD_JAVA=OFF + make + - name: Run Tests + working-directory: build + env: + ASAN_OPTIONS: detect_leaks=1:symbolize=1:strict_string_checks=1:halt_on_error=0:detect_container_overflow=0 + LSAN_OPTIONS: suppressions=${{ github.workspace }}/.github/lsan-suppressions.txt + UBSAN_OPTIONS: print_stacktrace=1 + run: | + ctest --output-on-failure diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b0350193ba..750dec550c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ name: Build and test on: @@ -30,11 +47,12 @@ jobs: - debian11 - debian12 - ubuntu24 - - fedora37 + - oraclelinux8 - oraclelinux9 + - amazonlinux23 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: "Test" run: | cd docker @@ -47,11 +65,12 @@ jobs: fail-fast: false matrix: os: - - ubuntu-20.04 - ubuntu-22.04 - - macos-12 + - ubuntu-24.04 + - ubuntu-24.04-arm - macos-13 - macos-14 + - macos-15 java: - 17 - 21 @@ -61,71 +80,40 @@ jobs: - os: ubuntu-22.04 java: 17 cxx: g++ + - os: ubuntu-latest + java: 25-ea env: MAVEN_OPTS: -Xmx2g MAVEN_SKIP_RC: true steps: - name: Checkout - uses: actions/checkout@v2 - - name: Cache Maven local repository - uses: actions/cache@v2 - with: - path: ~/.m2/repository - key: ${{ matrix.java }}-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ matrix.java }}-maven- + uses: actions/checkout@v4 - name: Install Java ${{ matrix.java }} - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: distribution: zulu java-version: ${{ matrix.java }} + cache: 'maven' - name: "Test" run: | mkdir -p ~/.m2 - mkdir build - cd build - if [ "${{ matrix.os }}" = "ubuntu-20.04" ]; then - cmake -DANALYZE_JAVA=ON -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} -DSTOP_BUILD_ON_WARNING=OFF .. + if [ "${{ matrix.java }}" = "25-ea" ]; then + cd java + # JDK 25 Build + ./mvnw package -DskipTests + # JDK 25 Test: shims, core, tools modules + ./mvnw package --pl tools --am else + mkdir build + cd build cmake -DANALYZE_JAVA=ON -DOPENSSL_ROOT_DIR=`brew --prefix openssl@1.1` .. + make package test-out fi - make package test-out - name: Step on failure if: ${{ failure() }} run: | cat /home/runner/work/orc/orc/build/java/rat.txt - windows: - name: "C++ ${{ matrix.simd }} Test on Windows" - runs-on: windows-2019 - strategy: - fail-fast: false - matrix: - simd: - - General - - AVX512 - env: - ORC_USER_SIMD_LEVEL: AVX512 - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v1.1 - with: - msbuild-architecture: x64 - - name: "Test" - shell: bash - run: | - mkdir build - cd build - if [ "${{ matrix.simd }}" = "General" ]; then - cmake .. -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF - else - cmake .. -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON - fi - cmake --build . 
--config Debug - ctest -C Debug --output-on-failure - simdUbuntu: name: "SIMD programming using C++ intrinsic functions on ${{ matrix.os }}" runs-on: ${{ matrix.os }} @@ -140,7 +128,7 @@ jobs: ORC_USER_SIMD_LEVEL: AVX512 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: "Test" run: | mkdir -p ~/.m2 @@ -150,16 +138,25 @@ jobs: make package test-out doc: - name: "Javadoc generation" - runs-on: ubuntu-20.04 + name: "Markdown check and Javadoc generation" + runs-on: ubuntu-24.04 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Super-Linter + uses: super-linter/super-linter@12150456a73e248bdc94d0794898f94e23127c88 + env: + DEFAULT_BRANCH: main + VALIDATE_MARKDOWN: true + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Install Java 17 - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: distribution: zulu java-version: 17 + cache: 'maven' - name: "javadoc" run: | mkdir -p ~/.m2 @@ -167,28 +164,40 @@ jobs: ./mvnw install -DskipTests ./mvnw javadoc:javadoc - formatting-check: - name: "C++ format check" - runs-on: ubuntu-20.04 - strategy: - matrix: - path: - - 'c++' - - 'tools' + cpp-linter: + runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@v3 - - name: Run clang-format style check for C++ code - uses: jidicula/clang-format-action@v4.9.0 - with: - clang-format-version: '13' - check-path: ${{ matrix.path }} + - uses: actions/checkout@v4 + - name: Run build + run: | + mkdir build && cd build + cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DBUILD_JAVA=OFF + cmake --build . + - uses: cpp-linter/cpp-linter-action@v2.13.3 + id: linter + continue-on-error: true + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + style: file + tidy-checks: file + files-changed-only: true + lines-changed-only: true + thread-comments: true + ignore: 'build|cmake_modules|conan|dev|docker|examples|java|site' + database: build + - name: Fail fast?! + if: steps.linter.outputs.checks-failed != 0 + run: | + echo "some linter checks failed. ${{ steps.linter.outputs.checks-failed }}" + exit 1 license-check: name: "License Check" runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Check license header uses: apache/skywalking-eyes@main env: @@ -196,3 +205,53 @@ jobs: with: config: .github/.licenserc.yaml + macos-cpp-check: + name: "C++ Test on macOS" + strategy: + fail-fast: false + matrix: + version: [13, 14, 15] + runs-on: macos-${{ matrix.version }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Install dependencies + run: | + brew update + brew install protobuf + - name: Test + run: | + CMAKE_PREFIX_PATH=$(brew --prefix protobuf) + mkdir -p build + cd build + cmake .. 
-DBUILD_JAVA=OFF -DPROTOBUF_HOME=${CMAKE_PREFIX_PATH} + make package test-out + + meson: + name: "Meson C++ configuration" + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: + - ubuntu-22.04 + - ubuntu-24.04 + - ubuntu-24.04-arm + - macos-13 + - macos-14 + - macos-15 + steps: + - name: Checkout + uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.x' + - name: Install meson + run: | + pip install --upgrade pip + pip install meson + - name: Test + run: | + meson setup build -Dbuildtype=release + meson compile -C build + meson test -C build diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml new file mode 100644 index 0000000000..52b2e1fc7b --- /dev/null +++ b/.github/workflows/pages.yml @@ -0,0 +1,72 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: GitHub Pages deployment + +on: + push: + branches: + - main + +concurrency: + group: 'docs preview' + cancel-in-progress: false + +jobs: + docs: + name: Build and deploy documentation + runs-on: ubuntu-latest + permissions: + id-token: write + pages: write + environment: + name: github-pages # https://github.com/actions/deploy-pages/issues/271 + if: github.repository == 'apache/orc' + steps: + - name: Checkout ORC repository + uses: actions/checkout@v4 + with: + repository: apache/orc + ref: 'main' + - name: Install Java 17 + uses: actions/setup-java@v4 + with: + distribution: zulu + java-version: 17 + - name: Install Ruby for documentation generation + uses: ruby/setup-ruby@v1 + with: + ruby-version: '3.3' + bundler-cache: true + - name: Run documentation build + run: | + cd site + gem install bundler -n /usr/local/bin + bundle install --retry=100 + git clone https://github.com/apache/orc.git -b asf-site target + bundle exec jekyll build -b /orc + - name: Setup Pages + uses: actions/configure-pages@v5 + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: 'site/target' + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/publish_snapshot.yml b/.github/workflows/publish_snapshot.yml index 5a91bcbfc2..eb6d771238 100644 --- a/.github/workflows/publish_snapshot.yml +++ b/.github/workflows/publish_snapshot.yml @@ -10,7 +10,7 @@ jobs: if: github.repository == 'apache/orc' runs-on: ubuntu-latest steps: - - uses: actions/checkout@master + - uses: actions/checkout@v4 - uses: actions/setup-java@v3 with: diff --git a/.gitignore b/.gitignore index 2ff46e9694..3635e33bf2 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,6 @@ dependency-reduced-pom.xml java/bench/data *.swp .cache/* +subprojects/* +!subprojects/packagefiles +!subprojects/*.wrap diff --git a/.markdownlint.yaml b/.markdownlint.yaml new file mode 100644 index 
0000000000..11c7a48ee6 --- /dev/null +++ b/.markdownlint.yaml @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +MD013: false diff --git a/.markdownlintignore b/.markdownlintignore new file mode 100644 index 0000000000..3953a04ce3 --- /dev/null +++ b/.markdownlintignore @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +site diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000000..e69de29bb2 diff --git a/CMakeLists.txt b/CMakeLists.txt index 1fb0e755d6..9d036aa8e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,10 +27,11 @@ project(ORC C CXX) # Version number of package SET(CPACK_PACKAGE_VERSION_MAJOR "2") -SET(CPACK_PACKAGE_VERSION_MINOR "1") +SET(CPACK_PACKAGE_VERSION_MINOR "3") SET(CPACK_PACKAGE_VERSION_PATCH "0-SNAPSHOT") SET(ORC_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake_modules") +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # For clang-tidy. 
+list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") option (BUILD_JAVA "Include ORC Java library in the build process" @@ -42,7 +43,7 @@ option (ANALYZE_JAVA option (BUILD_LIBHDFSPP "Include LIBHDFSPP library in the build process" - ON) + OFF) option(BUILD_CPP_TESTS "Build the googletest unit tests" @@ -76,10 +77,22 @@ option(BUILD_ENABLE_AVX512 "Enable build with AVX512 at compile time" OFF) +option(ENABLE_ASAN + "Enable Address Sanitizer" + OFF) + option(ORC_PACKAGE_KIND "Arbitrary string that identifies the kind of package" "") +option(ORC_ENABLE_CLANG_TOOLS + "Enable Clang tools" + OFF) + +option(ENABLE_UBSAN + "Enable Undefined Behavior Sanitizer" + OFF) + # Make sure that a build type is selected if (NOT CMAKE_BUILD_TYPE) message(STATUS "No build type selected, default to ReleaseWithDebugInfo") @@ -151,17 +164,38 @@ elseif (MSVC) set (WARN_FLAGS "${WARN_FLAGS} -wd4521") # multiple copy constructors specified set (WARN_FLAGS "${WARN_FLAGS} -wd4146") # unary minus operator applied to unsigned type, result still unsigned endif () +# Configure Address Sanitizer if enabled +if (ENABLE_ASAN) + if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer") + message(STATUS "Address Sanitizer enabled") + else() + message(WARNING "Address Sanitizer is only supported for GCC and Clang compilers") + endif() +endif() -if (BUILD_CPP_ENABLE_METRICS) - message(STATUS "Enable the metrics collection") - add_compile_definitions(ENABLE_METRICS=1) -else () - message(STATUS "Disable the metrics collection") - add_compile_definitions(ENABLE_METRICS=0) -endif () +# Configure Undefined Behavior Sanitizer if enabled +if (ENABLE_UBSAN) + if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr,function -fno-sanitize-recover=all") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr,function -fno-sanitize-recover=all") + message(STATUS "Undefined Behavior Sanitizer enabled") + else() + message(WARNING "Undefined Behavior Sanitizer is only supported for GCC and Clang compilers") + endif() +endif() enable_testing() +INCLUDE(GNUInstallDirs) # Put it before ThirdpartyToolchain to make CMAKE_INSTALL_LIBDIR available. + +if (ORC_PACKAGE_KIND STREQUAL "vcpkg") + set(ORC_INSTALL_CMAKE_DIR ${CMAKE_INSTALL_DATAROOTDIR}/orc) +else () + set(ORC_INSTALL_CMAKE_DIR ${CMAKE_INSTALL_LIBDIR}/cmake/orc) +endif () + INCLUDE(CheckSourceCompiles) INCLUDE(ThirdpartyToolchain) @@ -180,7 +214,7 @@ if (BUILD_ENABLE_AVX512 AND NOT APPLE) INCLUDE(ConfigSimdLevel) endif () -set (EXAMPLE_DIRECTORY ${CMAKE_SOURCE_DIR}/examples) +set (EXAMPLE_DIRECTORY ${PROJECT_SOURCE_DIR}/examples) add_subdirectory(c++) @@ -210,3 +244,7 @@ if (BUILD_CPP_TESTS) ) endif () endif () + +if (ORC_ENABLE_CLANG_TOOLS) + INCLUDE(CheckFormat) +endif () diff --git a/README.md b/README.md index 60b0da5fcb..2ddf0849b9 100644 --- a/README.md +++ b/README.md @@ -18,20 +18,21 @@ lists, maps, and unions. This project includes both a Java library and a C++ library for reading and writing the _Optimized Row Columnar_ (ORC) file format. The C++ and Java libraries are completely independent of each other and will each read all versions of ORC files. 
Releases: -* Latest: Apache ORC releases -* Maven Central: ![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.apache.orc/orc/badge.svg) -* Downloads: Apache ORC downloads -* Release tags: Apache ORC release tags -* Plan: Apache ORC future release plan + +* Latest: [Apache ORC releases](https://orc.apache.org/releases) +* Maven Central: [![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.apache.orc/orc/badge.svg)](https://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.orc%22) +* Downloads: [Apache ORC downloads](https://orc.apache.org/downloads) +* Release tags: [Apache ORC release tags](https://github.com/apache/orc/releases) +* Plan: [Apache ORC future release plan](https://github.com/apache/orc/milestones) The current build status: -* Main branch -![main build status](https://github.com/apache/orc/actions/workflows/build_and_test.yml/badge.svg?branch=main) -Bug tracking: Apache Jira +* Main branch [![main build status](https://github.com/apache/orc/actions/workflows/build_and_test.yml/badge.svg?branch=main)](https://github.com/apache/orc/actions/workflows/build_and_test.yml?query=branch%3Amain) +Bug tracking: [Apache Jira](https://orc.apache.org/bugs) The subdirectories are: + * c++ - the c++ reader and writer * cmake_modules - the cmake modules * docker - docker scripts to build and test on various linuxes @@ -43,10 +44,11 @@ The subdirectories are: ### Building * Install java 17 or higher -* Install maven 3.9.6 or higher +* Install maven 3.9.9 or higher * Install cmake 3.12 or higher To build a release version with debug information: + ```shell % mkdir build % cd build @@ -57,6 +59,7 @@ To build a release version with debug information: ``` To build a debug version: + ```shell % mkdir build % cd build @@ -67,6 +70,7 @@ To build a debug version: ``` To build a release version without debug information: + ```shell % mkdir build % cd build @@ -77,6 +81,7 @@ To build a release version without debug information: ``` To build only the Java library: + ```shell % cd java % ./mvnw package @@ -84,6 +89,7 @@ To build only the Java library: ``` To build only the C++ library: + ```shell % mkdir build % cd build @@ -94,6 +100,7 @@ To build only the C++ library: ``` To build the C++ library with AVX512 enabled: + ```shell export ORC_USER_SIMD_LEVEL=AVX512 % mkdir build @@ -102,8 +109,49 @@ export ORC_USER_SIMD_LEVEL=AVX512 % make package % make test-out ``` + Cmake option BUILD_ENABLE_AVX512 can be set to "ON" or (default value)"OFF" at the compile time. At compile time, it defines the SIMD level(AVX512) to be compiled into the binaries. Environment variable ORC_USER_SIMD_LEVEL can be set to "AVX512" or (default value)"NONE" at the run time. At run time, it defines the SIMD level to dispatch the code which can apply SIMD optimization. Note that if ORC_USER_SIMD_LEVEL is set to "NONE" at run time, AVX512 will not take effect at run time even if BUILD_ENABLE_AVX512 is set to "ON" at compile time. + +### Building with Meson + +While CMake is the official build system for orc, there is unofficial support for using Meson to build select parts of the project. To build a debug version of the library and test it using Meson, from the project root you can run: + +```shell +meson setup build +meson compile -C build +meson test -C build +``` + +By default, Meson will build unoptimized libraries with debug symbols. By contrast, the CMake build system generates release libraries by default. 
If you would like to create release libraries ala CMake, you should set the buildtype option. You must either remove the existing build directory before changing that setting, or alternatively pass the ``--reconfigure`` flag: + +```shell +meson setup build -Dbuildtype=release --reconfigure +meson compile -C build +meson test -C build +``` + +Meson supports running your test suite through valgrind out of the box: + +```shell +meson test -C build --wrap=valgrind +``` + +If you'd like to enable sanitizers, you can leverage the ``-Db_sanitize=`` option. For example, to enable both ASAN and UBSAN, you can run: + +```shell +meson setup build -Dbuildtype=debug -Db_sanitize=address,undefined --reconfigure +meson compile -C build +meson test +``` + +Meson takes care of detecting all dependencies on your system, and downloading missing ones as required through its [Wrap system](https://mesonbuild.com/Wrap-dependency-system-manual.html). The dependencies for the project are all stored in the ``subprojects`` directory in individual wrap files. The majority of these are system generated files created by running: + +```shell +meson wrap install +``` + +From the project root. If you are developing orc and need to add a new dependency in the future, be sure to check Meson's [WrapDB](https://mesonbuild.com/Wrapdb-projects.html) to check if a pre-configured wrap entry exists. If not, you may still manually configure the dependency as outlined in the aforementioned Wrap system documentation. diff --git a/c++/CMakeLists.txt b/c++/CMakeLists.txt index 449bd10f3e..38c38f7ce4 100644 --- a/c++/CMakeLists.txt +++ b/c++/CMakeLists.txt @@ -15,14 +15,23 @@ # specific language governing permissions and limitations # under the License. -include_directories ( - ${CMAKE_CURRENT_BINARY_DIR}/include - "include" - ) - add_subdirectory(include) add_subdirectory(src) if (BUILD_CPP_TESTS) add_subdirectory(test) endif () + +# Generate cmake package configuration files +include(CMakePackageConfigHelpers) +configure_package_config_file( + orcConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/orcConfig.cmake + INSTALL_DESTINATION ${ORC_INSTALL_CMAKE_DIR}) +write_basic_package_version_file( + ${CMAKE_CURRENT_BINARY_DIR}/orcConfigVersion.cmake + VERSION ${ORC_VERSION} + COMPATIBILITY SameMajorVersion) +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/orcConfig.cmake + ${CMAKE_CURRENT_BINARY_DIR}/orcConfigVersion.cmake + DESTINATION ${ORC_INSTALL_CMAKE_DIR}) diff --git a/c++/build-support/README.md b/c++/build-support/README.md new file mode 100644 index 0000000000..80966104bb --- /dev/null +++ b/c++/build-support/README.md @@ -0,0 +1,30 @@ +# Build support + +The Python scripts under the folder provide capabilities for formatting code. +Make sure you've installed `clang-format-13`, `clang-tidy-13` and `clang-apply-replacements-13` and cmake could find them. +We enforce the version of tools because different versions of tools may generate different results. + +## clang-format + +To use `run_clang_format.py` you could act like below: + +```shell +mkdir build +cd build +cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DORC_ENABLE_CLANG_TOOLS=1 +make check-format # Do checks only +make format # This would apply suggested changes, take care! +``` + +## clang-tidy + +To use `run_clang_tidy.py` you could act like below: + +```shell +mkdir build +cd build +cmake .. 
-DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DORC_ENABLE_CLANG_TOOLS=1 +make -j`nproc` # Important +make check-clang-tidy # Do checks only +make fix-clang-tidy # This would apply suggested changes, take care! +``` diff --git a/c++/build-support/run_clang_format.py b/c++/build-support/run_clang_format.py new file mode 100644 index 0000000000..52d2e6b255 --- /dev/null +++ b/c++/build-support/run_clang_format.py @@ -0,0 +1,132 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import argparse +import codecs +import difflib +import fnmatch +import os +import subprocess +import sys + + +def check(arguments, source_dir): + formatted_filenames = [] + error = False + for directory, subdirs, filenames in os.walk(source_dir): + fullpaths = (os.path.join(directory, filename) + for filename in filenames) + source_files = [x for x in fullpaths + if x.endswith(".hh") or x.endswith(".cc")] + formatted_filenames.extend( + # Filter out files that match the globs in the globs file + [filename for filename in source_files + if not any((fnmatch.fnmatch(filename, exclude_glob) + for exclude_glob in exclude_globs))]) + + if arguments.fix: + if not arguments.quiet: + # Print out each file on its own line, but run + # clang format once for all of the files + print("\n".join(map(lambda x: "Formatting {}".format(x), + formatted_filenames))) + subprocess.check_call([arguments.clang_format_binary, + "-i"] + formatted_filenames) + else: + for filename in formatted_filenames: + if not arguments.quiet: + print("Checking {}".format(filename)) + # + # Due to some incompatibilities between Python 2 and + # Python 3, there are some specific actions we take here + # to make sure the difflib.unified_diff call works. + # + # In Python 2, the call to subprocess.check_output return + # a 'str' type. In Python 3, however, the call returns a + # 'bytes' type unless the 'encoding' argument is + # specified. Unfortunately, the 'encoding' argument is not + # in the Python 2 API. We could do an if/else here based + # on the version of Python we are running, but it's more + # straightforward to read the file in binary and do utf-8 + # conversion. In Python 2, it's just converting string + # types to unicode types, whereas in Python 3 it's + # converting bytes types to utf-8 encoded str types. This + # approach ensures that the arguments to + # difflib.unified_diff are acceptable string types in both + # Python 2 and Python 3. 
+            with open(filename, "rb") as reader:
+                # Run clang-format and capture its output
+                formatted = subprocess.check_output(
+                    [arguments.clang_format_binary,
+                     filename])
+                formatted = codecs.decode(formatted, "utf-8")
+                # Read the original file
+                original = codecs.decode(reader.read(), "utf-8")
+                # Run the equivalent of diff -u
+                diff = list(difflib.unified_diff(
+                    original.splitlines(True),
+                    formatted.splitlines(True),
+                    fromfile=filename,
+                    tofile="{} (after clang format)".format(
+                        filename)))
+                if diff:
+                    print("{} had clang-format style issues".format(filename))
+                    # Print out the diff to stderr
+                    error = True
+                    sys.stderr.writelines(diff)
+    return error
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Runs clang-format on all of the source "
+        "files. If --fix is specified, re-formats the files in "
+        "place; otherwise compares the re-formatted output with "
+        "the existing files and outputs a unified diff of any "
+        "necessary changes")
+    parser.add_argument("clang_format_binary",
+                        help="Path to the clang-format binary")
+    parser.add_argument("--exclude_globs",
+                        help="Filename containing globs for files "
+                        "that should be excluded from the checks")
+    parser.add_argument("--source_dirs",
+                        help="Comma-separated root directories of the code")
+    parser.add_argument("--fix", default=False,
+                        action="http://github.com/store_true",
+                        help="If specified, will re-format the source "
+                        "code instead of comparing the re-formatted "
+                        "output, defaults to %(default)s")
+    parser.add_argument("--quiet", default=False,
+                        action="http://github.com/store_true",
+                        help="If specified, only print errors")
+
+    args = parser.parse_args()
+
+    had_err = False
+    exclude_globs = []
+    if args.exclude_globs:
+        for line in open(args.exclude_globs):
+            if line.strip() == "":
+                continue
+            if line[0] == "#":
+                continue
+            exclude_globs.append(line.strip())
+
+    for source_dir in args.source_dirs.split(','):
+        if len(source_dir) > 0:
+            had_err = had_err or check(args, source_dir)
+
+    sys.exit(1 if had_err else 0)
\ No newline at end of file
diff --git a/run_clang_tidy.py b/c++/build-support/run_clang_tidy.py
old mode 100644
new mode 100755
similarity index 100%
rename from run_clang_tidy.py
rename to c++/build-support/run_clang_tidy.py
diff --git a/c++/include/CMakeLists.txt b/c++/include/CMakeLists.txt
index 056d1b9fab..a9f8b4a3b5 100644
--- a/c++/include/CMakeLists.txt
+++ b/c++/include/CMakeLists.txt
@@ -22,10 +22,11 @@ configure_file (
 
 install(FILES
   "${CMAKE_CURRENT_BINARY_DIR}/orc/orc-config.hh"
-  DESTINATION "include/orc"
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/orc"
   )
 
 install(DIRECTORY
-  "orc/"
-  DESTINATION "include/orc"
-  FILES_MATCHING PATTERN "*.hh")
+  "orc/"
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/orc"
+  FILES_MATCHING PATTERN "*.hh"
+  )
diff --git a/c++/include/orc/Exceptions.hh b/c++/include/orc/Exceptions.hh
index 97cf5d8a0d..b19a00760c 100644
--- a/c++/include/orc/Exceptions.hh
+++ b/c++/include/orc/Exceptions.hh
@@ -67,6 +67,18 @@ namespace orc {
     SchemaEvolutionError(const SchemaEvolutionError&);
     SchemaEvolutionError& operator=(const SchemaEvolutionError&) = delete;
   };
+
+  class CompressionError : public std::runtime_error {
+   public:
+    explicit CompressionError(const std::string& whatArg);
+    explicit CompressionError(const char* whatArg);
+    ~CompressionError() noexcept override;
+    CompressionError(const CompressionError&);
+
+   private:
+    CompressionError& operator=(const CompressionError&);
+  };
+
 }  // namespace orc
 
 #endif
diff --git a/c++/include/orc/Geospatial.hh
b/c++/include/orc/Geospatial.hh new file mode 100644 index 0000000000..d3b9e28285 --- /dev/null +++ b/c++/include/orc/Geospatial.hh @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This file contains code adapted from the Apache Arrow project. + * + * Original source: + * https://github.com/apache/arrow/blob/main/cpp/src/parquet/geospatial/statistics.h + * + * The original code is licensed under the Apache License, Version 2.0. + * + * Modifications may have been made from the original source. + */ + +#ifndef ORC_GEOSPATIAL_HH +#define ORC_GEOSPATIAL_HH + +#include +#include +#include +#include + +namespace orc::geospatial { + + constexpr double INF = std::numeric_limits::infinity(); + // The maximum number of dimensions supported (X, Y, Z, M) + inline constexpr int MAX_DIMENSIONS = 4; + + // Supported combinations of geometry dimensions + enum class Dimensions { + XY = 0, // X and Y only + XYZ = 1, // X, Y, and Z + XYM = 2, // X, Y, and M + XYZM = 3, // X, Y, Z, and M + VALUE_MIN = 0, + VALUE_MAX = 3 + }; + + // Supported geometry types according to ISO WKB + enum class GeometryType { + POINT = 1, + LINESTRING = 2, + POLYGON = 3, + MULTIPOINT = 4, + MULTILINESTRING = 5, + MULTIPOLYGON = 6, + GEOMETRYCOLLECTION = 7, + VALUE_MIN = 1, + VALUE_MAX = 7 + }; + + // BoundingBox represents the minimum bounding rectangle (or box) for a geometry. + // It supports up to 4 dimensions (X, Y, Z, M). + struct BoundingBox { + using XY = std::array; + using XYZ = std::array; + using XYM = std::array; + using XYZM = std::array; + + // Default constructor: initializes to an empty bounding box. + BoundingBox() : min{INF, INF, INF, INF}, max{-INF, -INF, -INF, -INF} {} + // Constructor with explicit min/max values. + BoundingBox(const XYZM& mins, const XYZM& maxes) : min(mins), max(maxes) {} + BoundingBox(const BoundingBox& other) = default; + BoundingBox& operator=(const BoundingBox&) = default; + + // Update the bounding box to include a 2D coordinate. + void updateXY(const XY& coord) { + updateInternal(coord); + } + // Update the bounding box to include a 3D coordinate (XYZ). + void updateXYZ(const XYZ& coord) { + updateInternal(coord); + } + // Update the bounding box to include a 3D coordinate (XYM). + void updateXYM(const XYM& coord) { + std::array dims = {0, 1, 3}; + for (int i = 0; i < 3; ++i) { + auto dim = dims[i]; + if (!std::isnan(min[dim]) && !std::isnan(max[dim])) { + min[dim] = std::min(min[dim], coord[i]); + max[dim] = std::max(max[dim], coord[i]); + } + } + } + // Update the bounding box to include a 4D coordinate (XYZM). + void updateXYZM(const XYZM& coord) { + updateInternal(coord); + } + + // Reset the bounding box to its initial empty state. 
+    void reset() {
+      for (int i = 0; i < MAX_DIMENSIONS; ++i) {
+        min[i] = INF;
+        max[i] = -INF;
+      }
+    }
+
+    // Invalidate the bounding box (set all values to NaN).
+    void invalidate() {
+      for (int i = 0; i < MAX_DIMENSIONS; ++i) {
+        min[i] = std::numeric_limits<double>::quiet_NaN();
+        max[i] = std::numeric_limits<double>::quiet_NaN();
+      }
+    }
+
+    // Check if the bound for a given dimension is empty.
+    bool boundEmpty(int dim) const {
+      return std::isinf(min[dim] - max[dim]);
+    }
+
+    // Check if the bound for a given dimension is valid (not NaN).
+    bool boundValid(int dim) const {
+      return !std::isnan(min[dim]) && !std::isnan(max[dim]);
+    }
+
+    // Get the lower bound (min values).
+    const XYZM& lowerBound() const {
+      return min;
+    }
+    // Get the upper bound (max values).
+    const XYZM& upperBound() const {
+      return max;
+    }
+
+    // Get validity for each dimension.
+    std::array<bool, MAX_DIMENSIONS> dimensionValid() const {
+      return {boundValid(0), boundValid(1), boundValid(2), boundValid(3)};
+    }
+    // Get emptiness for each dimension.
+    std::array<bool, MAX_DIMENSIONS> dimensionEmpty() const {
+      return {boundEmpty(0), boundEmpty(1), boundEmpty(2), boundEmpty(3)};
+    }
+
+    // Merge another bounding box into this one.
+    void merge(const BoundingBox& other) {
+      for (int i = 0; i < MAX_DIMENSIONS; ++i) {
+        if (std::isnan(min[i]) || std::isnan(max[i]) || std::isnan(other.min[i]) ||
+            std::isnan(other.max[i])) {
+          min[i] = std::numeric_limits<double>::quiet_NaN();
+          max[i] = std::numeric_limits<double>::quiet_NaN();
+        } else {
+          min[i] = std::min(min[i], other.min[i]);
+          max[i] = std::max(max[i], other.max[i]);
+        }
+      }
+    }
+
+    // Convert the bounding box to a string representation.
+    std::string toString() const;
+
+    XYZM min;  // Minimum values for each dimension
+    XYZM max;  // Maximum values for each dimension
+
+   private:
+    // Internal update function for XY, XYZ, or XYZM coordinates.
+    template <typename Coord>
+    void updateInternal(const Coord& coord) {
+      for (size_t i = 0; i < coord.size(); ++i) {
+        if (!std::isnan(min[i]) && !std::isnan(max[i])) {
+          min[i] = std::min(min[i], coord[i]);
+          max[i] = std::max(max[i], coord[i]);
+        }
+      }
+    }
+  };
+
+  inline bool operator==(const BoundingBox& lhs, const BoundingBox& rhs) {
+    return lhs.min == rhs.min && lhs.max == rhs.max;
+  }
+  inline bool operator!=(const BoundingBox& lhs, const BoundingBox& rhs) {
+    return !(lhs == rhs);
+  }
+  inline std::ostream& operator<<(std::ostream& os, const BoundingBox& obj) {
+    os << obj.toString();
+    return os;
+  }
+
+}  // namespace orc::geospatial
+
+#endif  // ORC_GEOSPATIAL_HH
diff --git a/c++/include/orc/Int128.hh b/c++/include/orc/Int128.hh
index 6954c771cf..e728e70e7b 100644
--- a/c++/include/orc/Int128.hh
+++ b/c++/include/orc/Int128.hh
@@ -193,43 +193,13 @@ namespace orc {
      * Shift left by the given number of bits.
      * Values larger than 2**127 will shift into the sign bit.
      */
-    Int128& operator<<=(uint32_t bits) {
-      if (bits != 0) {
-        if (bits < 64) {
-          highbits_ <<= bits;
-          highbits_ |= (lowbits_ >> (64 - bits));
-          lowbits_ <<= bits;
-        } else if (bits < 128) {
-          highbits_ = static_cast<int64_t>(lowbits_) << (bits - 64);
-          lowbits_ = 0;
-        } else {
-          highbits_ = 0;
-          lowbits_ = 0;
-        }
-      }
-      return *this;
-    }
+    Int128& operator<<=(uint32_t bits);
 
     /**
      * Shift right by the given number of bits. Negative values will
     * sign extend and fill with one bits.
     */
-    Int128& operator>>=(uint32_t bits) {
-      if (bits != 0) {
-        if (bits < 64) {
-          lowbits_ >>= bits;
-          lowbits_ |= static_cast<uint64_t>(highbits_ << (64 - bits));
-          highbits_ = static_cast<int64_t>(static_cast<uint64_t>(highbits_) >> bits);
-        } else if (bits < 128) {
-          lowbits_ = static_cast<uint64_t>(highbits_ >> (bits - 64));
-          highbits_ = highbits_ >= 0 ? 0 : -1l;
-        } else {
-          highbits_ = highbits_ >= 0 ? 0 : -1l;
-          lowbits_ = static_cast<uint64_t>(highbits_);
-        }
-      }
-      return *this;
-    }
+    Int128& operator>>=(uint32_t bits);
 
     bool operator==(const Int128& right) const {
       return highbits_ == right.highbits_ && lowbits_ == right.lowbits_;
diff --git a/c++/include/orc/OrcFile.hh b/c++/include/orc/OrcFile.hh
index 6e4a07bf7c..ea71567c5f 100644
--- a/c++/include/orc/OrcFile.hh
+++ b/c++/include/orc/OrcFile.hh
@@ -19,6 +19,7 @@
 #ifndef ORC_FILE_HH
 #define ORC_FILE_HH
 
+#include <future>
 #include <string>
 
 #include "orc/Reader.hh"
@@ -58,6 +59,18 @@ namespace orc {
      */
     virtual void read(void* buf, uint64_t length, uint64_t offset) = 0;
 
+    /**
+     * Read data asynchronously into the buffer. The buffer is allocated by the caller.
+     * @param buf the buffer to read into
+     * @param length the number of bytes to read.
+     * @param offset the position in the stream to read from.
+     * @return a future that will be set when the read is complete.
+     */
+    virtual std::future<void> readAsync(void* buf, uint64_t length, uint64_t offset) {
+      return std::async(std::launch::async,
+                        [this, buf, length, offset] { this->read(buf, length, offset); });
+    }
+
     /**
      * Get the name of the stream for error messages.
      */
@@ -127,8 +140,8 @@ namespace orc {
    * @param path the uri of the file in HDFS
    * @param metrics the metrics of the reader
    */
-  std::unique_ptr<InputStream> readHdfsFile(const std::string& path,
-                                            ReaderMetrics* metrics = nullptr);
+  [[deprecated("readHdfsFile is deprecated in 2.0.1")]] std::unique_ptr<InputStream> readHdfsFile(
+      const std::string& path, ReaderMetrics* metrics = nullptr);
 
   /**
    * Create a reader to read the ORC file.
diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index 4b254593ee..e9f420f113 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -40,6 +40,17 @@ namespace orc {
   struct ReaderOptionsPrivate;
   struct RowReaderOptionsPrivate;
 
+  struct CacheOptions {
+    // The maximum distance in bytes between two consecutive
+    // ranges; beyond this value, ranges are not combined
+    uint64_t holeSizeLimit = 8192;
+
+    // The maximum size in bytes of a combined range; if
+    // combining two consecutive ranges would produce a range of a
+    // size greater than this, they are not combined
+    uint64_t rangeSizeLimit = 32 * 1024 * 1024;
+  };
+
   /**
    * Expose the reader metrics including the latency and
    * number of calls of the decompression/decoding/IO modules.
@@ -59,9 +70,20 @@
     std::atomic<uint64_t> IOBlockingLatencyUs{0};
     std::atomic<uint64_t> SelectedRowGroupCount{0};
     std::atomic<uint64_t> EvaluatedRowGroupCount{0};
+    std::atomic<uint64_t> ReadRangeCacheHits{0};
+    std::atomic<uint64_t> ReadRangeCacheMisses{0};
   };
   ReaderMetrics* getDefaultReaderMetrics();
 
+  // Row group index of a single column in a stripe.
+  struct RowGroupIndex {
+    // Positions are represented as a two-dimensional array where the first
+    // dimension is row group index and the second dimension is the position
+    // list of the row group. The size of the second dimension should be equal
+    // among all row groups.
+    std::vector<std::vector<uint64_t>> positions;
+  };
+
   /**
    * Options for creating a Reader.
    */
@@ -107,6 +129,11 @@ namespace orc {
      */
     ReaderOptions& setReaderMetrics(ReaderMetrics* metrics);
 
+    /**
+     * Set the cache options.
+     */
+    ReaderOptions& setCacheOptions(const CacheOptions& cacheOptions);
+
     /**
      * Set the location of the tail as defined by the logical length of the
      * file.
@@ -138,6 +165,11 @@
      * Get the reader metrics.
      */
     ReaderMetrics* getReaderMetrics() const;
+
+    /**
+     * Get the cache options.
+     */
+    const CacheOptions& getCacheOptions() const;
   };
 
   /**
@@ -466,9 +498,11 @@
     /**
      * Get the statistics about a stripe.
      * @param stripeIndex the index of the stripe (0 to N-1) to get statistics about
-     * @return the statistics about that stripe
+     * @param includeRowIndex whether the row index of the stripe is included
+     * @return the statistics about that stripe and row group index statistics
      */
-    virtual std::unique_ptr<StripeStatistics> getStripeStatistics(uint64_t stripeIndex) const = 0;
+    virtual std::unique_ptr<StripeStatistics> getStripeStatistics(
+        uint64_t stripeIndex, bool includeRowIndex = true) const = 0;
 
     /**
      * Get the length of the data stripes in the file.
@@ -605,6 +639,33 @@
      */
     virtual std::map<uint32_t, BloomFilterIndex> getBloomFilters(
         uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0;
+
+    /**
+     * Get row group index of all selected columns in the specified stripe
+     * @param stripeIndex index of the stripe to be read for row group index.
+     * @param included index of selected columns to return (if not specified,
+     *        all columns will be returned).
+     * @return map of row group index keyed by its column index.
+     */
+    virtual std::map<uint32_t, RowGroupIndex> getRowGroupIndex(
+        uint32_t stripeIndex, const std::set<uint32_t>& included = {}) const = 0;
+
+    /**
+     * Trigger IO prefetch and cache the prefetched contents asynchronously.
+     * It is thread safe. Users should make sure requested stripes and columns
+     * are not overlapped, otherwise the overlapping part will be prefetched multiple times,
+     * which doesn't affect correctness but wastes IO and memory resources.
+     * @param stripes the stripes to prefetch
+     * @param includeTypes the types to prefetch
+     */
+    virtual void preBuffer(const std::vector<uint32_t>& stripes,
+                           const std::list<uint64_t>& includeTypes) = 0;
+
+    /**
+     * Release cached entries whose right boundary is less than or equal to the given boundary.
+ * @param boundary the boundary value to release cache entries + */ + virtual void releaseBuffer(uint64_t boundary) = 0; }; /** diff --git a/c++/include/orc/Statistics.hh b/c++/include/orc/Statistics.hh index 4ba8c35f7d..58169abe59 100644 --- a/c++/include/orc/Statistics.hh +++ b/c++/include/orc/Statistics.hh @@ -19,12 +19,11 @@ #ifndef ORC_STATISTICS_HH #define ORC_STATISTICS_HH +#include "orc/Geospatial.hh" #include "orc/Type.hh" #include "orc/Vector.hh" #include "orc/orc-config.hh" -#include - namespace orc { /** @@ -367,6 +366,33 @@ namespace orc { virtual int32_t getMaximumNanos() const = 0; }; + /** + * Statistics for Geometry and Geography + */ + class GeospatialColumnStatistics : public ColumnStatistics { + public: + virtual ~GeospatialColumnStatistics(); + + /** + * Get bounding box + * @return bounding box + */ + virtual const geospatial::BoundingBox& getBoundingBox() const = 0; + + /** + * Get geospatial types + * @return a sorted vector of geometry type IDs that elements is unique + */ + virtual std::vector getGeospatialTypes() const = 0; + + /** + * Update stats by a new value + * @param value new value to update + * @param length length of the value + */ + virtual void update(const char* value, size_t length) = 0; + }; + class Statistics { public: virtual ~Statistics(); diff --git a/c++/include/orc/Type.hh b/c++/include/orc/Type.hh index 82e0e3cc86..4bb794ff34 100644 --- a/c++/include/orc/Type.hh +++ b/c++/include/orc/Type.hh @@ -25,6 +25,18 @@ namespace orc { + namespace geospatial { + enum EdgeInterpolationAlgorithm { + SPHERICAL = 0, + VINCENTY = 1, + THOMAS = 2, + ANDOYER = 3, + KARNEY = 4 + }; + std::string AlgoToString(EdgeInterpolationAlgorithm algo); + EdgeInterpolationAlgorithm AlgoFromString(const std::string& algo); + } // namespace geospatial + enum TypeKind { BOOLEAN = 0, BYTE = 1, @@ -44,7 +56,9 @@ namespace orc { DATE = 15, VARCHAR = 16, CHAR = 17, - TIMESTAMP_INSTANT = 18 + TIMESTAMP_INSTANT = 18, + GEOMETRY = 19, + GEOGRAPHY = 20 }; class Type { @@ -59,6 +73,10 @@ namespace orc { virtual uint64_t getMaximumLength() const = 0; virtual uint64_t getPrecision() const = 0; virtual uint64_t getScale() const = 0; + // for geospatial types only + virtual const std::string& getCrs() const = 0; + // for geography type only + virtual geospatial::EdgeInterpolationAlgorithm getAlgorithm() const = 0; virtual Type& setAttribute(const std::string& key, const std::string& value) = 0; virtual bool hasAttributeKey(const std::string& key) const = 0; virtual Type& removeAttribute(const std::string& key) = 0; @@ -115,6 +133,10 @@ namespace orc { std::unique_ptr createListType(std::unique_ptr elements); std::unique_ptr createMapType(std::unique_ptr key, std::unique_ptr value); std::unique_ptr createUnionType(); + std::unique_ptr createGeometryType(const std::string& crs = "OGC:CRS84"); + std::unique_ptr createGeographyType( + const std::string& crs = "OGC:CRS84", + geospatial::EdgeInterpolationAlgorithm algo = geospatial::SPHERICAL); } // namespace orc #endif diff --git a/c++/include/orc/Vector.hh b/c++/include/orc/Vector.hh index 0dfe926965..663bef9cd7 100644 --- a/c++/include/orc/Vector.hh +++ b/c++/include/orc/Vector.hh @@ -57,6 +57,8 @@ namespace orc { bool hasNulls; // whether the vector batch is encoded bool isEncoded; + // whether the dictionary is decoded into vector batch + bool dictionaryDecoded; // custom memory pool MemoryPool& memoryPool; @@ -88,6 +90,14 @@ namespace orc { */ virtual bool hasVariableLength(); + /** + * Decode possible dictionary into vector batch. 
+ */ + void decodeDictionary(); + + protected: + virtual void decodeDictionaryImpl() {} + private: ColumnVectorBatch(const ColumnVectorBatch&); ColumnVectorBatch& operator=(const ColumnVectorBatch&); @@ -248,6 +258,10 @@ namespace orc { ~EncodedStringVectorBatch() override; std::string toString() const override; void resize(uint64_t capacity) override; + + // Calculate data and length in StringVectorBatch from dictionary and index + void decodeDictionaryImpl() override; + std::shared_ptr dictionary; // index for dictionary entry @@ -264,6 +278,9 @@ namespace orc { bool hasVariableLength() override; std::vector fields; + + protected: + void decodeDictionaryImpl() override; }; struct ListVectorBatch : public ColumnVectorBatch { @@ -283,6 +300,9 @@ namespace orc { // the concatenated elements std::unique_ptr elements; + + protected: + void decodeDictionaryImpl() override; }; struct MapVectorBatch : public ColumnVectorBatch { @@ -304,6 +324,9 @@ namespace orc { std::unique_ptr keys; // the concatenated elements std::unique_ptr elements; + + protected: + void decodeDictionaryImpl() override; }; struct UnionVectorBatch : public ColumnVectorBatch { @@ -327,6 +350,9 @@ namespace orc { // the sub-columns std::vector children; + + protected: + void decodeDictionaryImpl() override; }; struct Decimal { diff --git a/c++/include/orc/Writer.hh b/c++/include/orc/Writer.hh index 7968fbce7f..78f06739bc 100644 --- a/c++/include/orc/Writer.hh +++ b/c++/include/orc/Writer.hh @@ -277,6 +277,32 @@ namespace orc { * @return if not set, return default value which is 1 MB. */ uint64_t getOutputBufferCapacity() const; + + /** + * Set the initial block size of original input buffer in the class CompressionStream. + * the input buffer is used to store raw data before compression, while the output buffer is + * dedicated to holding compressed data + */ + WriterOptions& setMemoryBlockSize(uint64_t capacity); + + /** + * Get the initial block size of original input buffer in the class CompressionStream. + * @return if not set, return default value which is 64 KB. + */ + uint64_t getMemoryBlockSize() const; + + /** + * Set whether the compression block should be aligned to row group boundary. + * The boolean type may not be aligned to row group boundary due to the + * requirement of the Boolean RLE encoder to pack input bits into bytes + */ + WriterOptions& setAlignBlockBoundToRowGroup(bool alignBlockBoundToRowGroup); + + /** + * Get if the compression block should be aligned to row group boundary. + * @return if not set, return default value which is false. + */ + bool getAlignBlockBoundToRowGroup() const; }; class Writer { diff --git a/c++/include/orc/meson.build b/c++/include/orc/meson.build new file mode 100644 index 0000000000..e2524051f0 --- /dev/null +++ b/c++/include/orc/meson.build @@ -0,0 +1,57 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +cdata = configuration_data() +cdata.set('ORC_VERSION', meson.project_version()) +cdata.set('ORC_CXX_HAS_CSTDINT', 1) + +configure_file( + input: 'orc-config.hh.in', + output: 'orc-config.hh', + configuration: cdata, + format: 'cmake', + install: true, + install_dir: 'orc', +) + +install_headers( + [ + 'BloomFilter.hh', + 'ColumnPrinter.hh', + 'Common.hh', + 'Exceptions.hh', + 'Geospatial.hh', + 'Int128.hh', + 'MemoryPool.hh', + 'OrcFile.hh', + 'Reader.hh', + 'Statistics.hh', + 'Type.hh', + 'Vector.hh', + 'Writer.hh', + ], + subdir: 'orc', +) + +install_headers( + [ + 'sargs/Literal.hh', + 'sargs/SearchArgument.hh', + 'sargs/TruthValue.hh', + ], + subdir: 'orc/sargs', +) diff --git a/c++/include/orc/sargs/SearchArgument.hh b/c++/include/orc/sargs/SearchArgument.hh index 6493840a92..2fa3ea04cb 100644 --- a/c++/include/orc/sargs/SearchArgument.hh +++ b/c++/include/orc/sargs/SearchArgument.hh @@ -251,6 +251,12 @@ namespace orc { * @return the new SearchArgument */ virtual std::unique_ptr build() = 0; + + /** + * Add a maybe leaf to the current item on the stack. + * @return this + */ + virtual SearchArgumentBuilder& maybe() = 0; }; /** diff --git a/c++/meson.build b/c++/meson.build new file mode 100644 index 0000000000..216d7e5634 --- /dev/null +++ b/c++/meson.build @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# required dependencies +protobuf_dep = dependency('protobuf', fallback: ['protobuf', 'protobuf_dep']) +lz4_dep = dependency('liblz4') +snappy_dep = dependency('snappy') +zlib_dep = dependency('zlib') +zstd_dep = dependency('libzstd') +sparsehash_c11_dep = dependency('sparsehash-c11') + +# optional dependencies (should be set later in configuration) +gtest_dep = disabler() +gmock_dep = disabler() + +subdir('include/orc') +subdir('src') + +if get_option('tests').enabled() + gtest_dep = dependency('gtest') + gmock_dep = dependency('gmock') + subdir('test') +endif + +pkg = import('pkgconfig') +pkg.generate(orc_lib) diff --git a/c++/orcConfig.cmake.in b/c++/orcConfig.cmake.in new file mode 100644 index 0000000000..49663b3423 --- /dev/null +++ b/c++/orcConfig.cmake.in @@ -0,0 +1,103 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This config sets the following variables in your project: +# +# orc_VERSION - version of the found ORC +# orc_FOUND - true if ORC found on the system +# +# This config sets the following targets in your project: +# +# orc::orc - for linked as static library +# +# For backward compatibility, this config also sets the following variables: +# +# ORC_FOUND - same as orc_FOUND above +# ORC_STATIC_LIB - static library of the found ORC +# ORC_INCLUDE_DIR - include directory of the found ORC +# ORC_INCLUDE_DIRS - same as ORC_INCLUDE_DIR above + +@PACKAGE_INIT@ + +set(ORC_VENDOR_DEPENDENCIES "@ORC_VENDOR_DEPENDENCIES@") +set(ORC_SYSTEM_DEPENDENCIES "@ORC_SYSTEM_DEPENDENCIES@") + +if(DEFINED CMAKE_MODULE_PATH) + set(ORC_CMAKE_MODULE_PATH_OLD ${CMAKE_MODULE_PATH}) +else() + unset(ORC_CMAKE_MODULE_PATH_OLD) +endif() +set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") + +include(CMakeFindDependencyMacro) +foreach(dependency ${ORC_SYSTEM_DEPENDENCIES}) + find_dependency(${dependency}) +endforeach() + +if(DEFINED ORC_CMAKE_MODULE_PATH_OLD) + set(CMAKE_MODULE_PATH ${ORC_CMAKE_MODULE_PATH_OLD}) + unset(ORC_CMAKE_MODULE_PATH_OLD) +else() + unset(CMAKE_MODULE_PATH) +endif() + +include("${CMAKE_CURRENT_LIST_DIR}/orcTargets.cmake") + +get_target_property(orc_static_configurations orc::orc IMPORTED_CONFIGURATIONS) + +foreach(dependency ${ORC_VENDOR_DEPENDENCIES}) + string(REPLACE "|" ";" dependency_pair ${dependency}) + list(LENGTH dependency_pair dependency_pair_length) + if(NOT dependency_pair_length EQUAL 2) + message(FATAL_ERROR "Invalid vendor dependency: ${dependency}") + endif() + list(GET dependency_pair 0 target_name) + list(GET dependency_pair 1 static_lib_name) + + add_library("${target_name}" STATIC IMPORTED) + + foreach(CONFIGURATION ${orc_static_configurations}) + string(TOUPPER "${CONFIGURATION}" CONFIGURATION) + get_target_property(orc_static_location orc::orc LOCATION_${CONFIGURATION}) + get_filename_component(orc_lib_dir "${orc_static_location}" DIRECTORY) + set_property(TARGET "${target_name}" + APPEND + PROPERTY IMPORTED_CONFIGURATIONS ${CONFIGURATION}) + set_target_properties("${target_name}" + PROPERTIES IMPORTED_LOCATION_${CONFIGURATION} + "${orc_lib_dir}/${static_lib_name}") + endforeach() +endforeach() + +check_required_components(orc) + +foreach(BUILD_TYPE_SUFFIX + "_RELEASE" + "_RELWITHDEBINFO" + "_MINSIZEREL" + "_DEBUG" + "") + if(NOT ORC_STATIC_LIB) + get_target_property(ORC_STATIC_LIB orc::orc IMPORTED_LOCATION${BUILD_TYPE_SUFFIX}) + endif() +endforeach() + +get_target_property(ORC_INCLUDE_DIR orc::orc INTERFACE_INCLUDE_DIRECTORIES) + +set(ORC_FOUND TRUE) +set(ORC_VERSION ${orc_VERSION}) +set(ORC_INCLUDE_DIRS ${ORC_INCLUDE_DIR}) diff --git a/c++/src/Adaptor.hh.in b/c++/src/Adaptor.hh.in index 2cce8158e2..f3ed763eb3 100644 --- a/c++/src/Adaptor.hh.in +++ b/c++/src/Adaptor.hh.in @@ -49,6 +49,12 @@ typedef SSIZE_T ssize_t; ssize_t pread(int fd, void* buf, size_t count, off_t offset); #endif +#if defined(__GNUC__) || defined(__clang__) + #define NO_SANITIZE_ATTR __attribute__((no_sanitize("signed-integer-overflow", "shift"))) +#else + #define NO_SANITIZE_ATTR +#endif + 
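For context on the NO_SANITIZE_ATTR macro introduced just above: later in this patch it is attached to the bloom-filter hash routines, whose signed arithmetic is allowed to wrap. A minimal, self-contained sketch of the intended usage (mixHash below is a hypothetical stand-in modeled on getLongHash):

    #include <cstdint>

    #if defined(__GNUC__) || defined(__clang__)
      #define NO_SANITIZE_ATTR __attribute__((no_sanitize("signed-integer-overflow", "shift")))
    #else
      #define NO_SANITIZE_ATTR
    #endif

    // Wrap-around in the mixing steps is intentional, so the attribute tells
    // UBSan not to report signed-integer-overflow or shift findings here.
    NO_SANITIZE_ATTR
    inline int64_t mixHash(int64_t key) {
      key = (~key) + (key << 21);  // may overflow by design
      key = key ^ (key >> 24);
      return key;
    }
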
#ifdef HAS_DIAGNOSTIC_PUSH #ifdef __clang__ #define DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") diff --git a/c++/src/BlockBuffer.hh b/c++/src/BlockBuffer.hh index 2faf38f7f9..6d265b0e32 100644 --- a/c++/src/BlockBuffer.hh +++ b/c++/src/BlockBuffer.hh @@ -106,12 +106,14 @@ namespace orc { } void resize(uint64_t size); + /** * Requests the BlockBuffer to contain at least newCapacity bytes. * Reallocation happens if there is need of more space. * @param newCapacity new capacity of BlockBuffer */ void reserve(uint64_t newCapacity); + /** * Write the BlockBuffer content into OutputStream * @param output the output stream to write to diff --git a/c++/src/BloomFilter.cc b/c++/src/BloomFilter.cc index 887637223a..025bdd8a03 100644 --- a/c++/src/BloomFilter.cc +++ b/c++/src/BloomFilter.cc @@ -208,7 +208,7 @@ namespace orc { } DIAGNOSTIC_POP - + NO_SANITIZE_ATTR void BloomFilterImpl::addHash(int64_t hash64) { int32_t hash1 = static_cast(hash64 & 0xffffffff); // In Java codes, we use "hash64 >>> 32" which is an unsigned shift op. @@ -226,6 +226,7 @@ namespace orc { } } + NO_SANITIZE_ATTR bool BloomFilterImpl::testHash(int64_t hash64) const { int32_t hash1 = static_cast(hash64 & 0xffffffff); // In Java codes, we use "hash64 >>> 32" which is an unsigned shift op. diff --git a/c++/src/BloomFilter.hh b/c++/src/BloomFilter.hh index ebc4a5ee04..75fb02a026 100644 --- a/c++/src/BloomFilter.hh +++ b/c++/src/BloomFilter.hh @@ -194,6 +194,7 @@ namespace orc { // Thomas Wang's integer hash function // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm // Put this in header file so tests can use it as well. + NO_SANITIZE_ATTR inline int64_t getLongHash(int64_t key) { key = (~key) + (key << 21); // key = (key << 21) - key - 1; key = key ^ (key >> 24); diff --git a/c++/src/ByteRLE.cc b/c++/src/ByteRLE.cc index bdbaad1da6..ded9f55a00 100644 --- a/c++/src/ByteRLE.cc +++ b/c++/src/ByteRLE.cc @@ -63,6 +63,8 @@ namespace orc { virtual void suppress() override; + virtual void finishEncode() override; + /** * Reset to initial state */ @@ -186,16 +188,17 @@ namespace orc { void ByteRleEncoderImpl::recordPosition(PositionRecorder* recorder) const { uint64_t flushedSize = outputStream->getSize(); - uint64_t unflushedSize = static_cast(bufferPosition); + uint64_t unusedBufferSize = static_cast(bufferLength - bufferPosition); if (outputStream->isCompressed()) { // start of the compression chunk in the stream recorder->add(flushedSize); - // number of decompressed bytes that need to be consumed - recorder->add(unflushedSize); + // There are multiple blocks in the input buffer, but bufferPosition only records the + // effective length of the last block. We need rawInputBufferSize to record the total length + // of all variable blocks. 
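As a worked illustration of the bookkeeping used in the calls just below (the numbers are made up for the example): suppose the raw input buffer currently holds blocks totalling getRawInputBufferSize() = 200 bytes, and the block most recently handed to the encoder has bufferLength = 64 with bufferPosition = 40. Then unusedBufferSize = 64 - 40 = 24 and the recorded uncompressed position is 200 - 24 = 176, i.e. the number of raw bytes that actually belong to the stream so far; for an uncompressed stream the same 24 unused bytes are subtracted from the flushed size instead.
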
+ recorder->add(outputStream->getRawInputBufferSize() - unusedBufferSize); } else { - flushedSize -= static_cast(bufferLength); // byte offset of the RLE run’s start location - recorder->add(flushedSize + unflushedSize); + recorder->add(flushedSize - unusedBufferSize); } recorder->add(static_cast(numLiterals)); } @@ -215,6 +218,13 @@ namespace orc { reset(); } + void ByteRleEncoderImpl::finishEncode() { + writeValues(); + outputStream->BackUp(bufferLength - bufferPosition); + outputStream->finishStream(); + bufferLength = bufferPosition = 0; + } + std::unique_ptr createByteRleEncoder( std::unique_ptr output) { return std::make_unique(std::move(output)); diff --git a/c++/src/ByteRLE.hh b/c++/src/ByteRLE.hh index bd19f52ecc..bee064f666 100644 --- a/c++/src/ByteRLE.hh +++ b/c++/src/ByteRLE.hh @@ -59,6 +59,13 @@ namespace orc { * suppress the data and reset to initial state */ virtual void suppress() = 0; + + /** + * Finalize the encoding process. This function should be called after all data required for + * encoding has been added. It ensures that any remaining data is processed and the final state + * of the encoder is set. + */ + virtual void finishEncode() = 0; }; class ByteRleDecoder { diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt index 33ad584840..b8a168307c 100644 --- a/c++/src/CMakeLists.txt +++ b/c++/src/CMakeLists.txt @@ -138,12 +138,6 @@ configure_file ( "${CMAKE_CURRENT_BINARY_DIR}/Adaptor.hh" ) -include_directories ( - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${LIBHDFSPP_INCLUDE_DIR} - ) - add_custom_command(OUTPUT orc_proto.pb.h orc_proto.pb.cc COMMAND ${PROTOBUF_EXECUTABLE} -I ../../orc-format_ep-prefix/src/orc-format_ep/src/main/proto/orc/proto @@ -156,6 +150,7 @@ set(SOURCE_FILES orc_proto.pb.h io/InputStream.cc io/OutputStream.cc + io/Cache.cc sargs/ExpressionTree.cc sargs/Literal.cc sargs/PredicateLeaf.cc @@ -176,6 +171,7 @@ set(SOURCE_FILES ConvertColumnReader.cc CpuInfoUtil.cc Exceptions.cc + Geospatial.cc Int128.cc LzoDecompressor.cc MemoryPool.cc @@ -197,7 +193,6 @@ set(SOURCE_FILES if(BUILD_LIBHDFSPP) set(SOURCE_FILES ${SOURCE_FILES} OrcHdfsFile.cc) - add_definitions(-DBUILD_LIBHDFSPP) endif(BUILD_LIBHDFSPP) if(BUILD_ENABLE_AVX512) @@ -209,14 +204,46 @@ endif(BUILD_ENABLE_AVX512) add_library (orc STATIC ${SOURCE_FILES}) target_link_libraries (orc - orc::protobuf - orc::zlib - orc::snappy - orc::lz4 - orc::zstd - ${LIBHDFSPP_LIBRARIES} + INTERFACE + ${ORC_INSTALL_INTERFACE_TARGETS} + PRIVATE + $ + $ + $ + $ + $ + $ + $ ) +target_include_directories (orc + INTERFACE + $ + PUBLIC + $ + $ + PRIVATE + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR} + ${LIBHDFSPP_INCLUDE_DIR} +) + +if (BUILD_LIBHDFSPP) + target_compile_definitions(orc PUBLIC -DBUILD_LIBHDFSPP) +endif (BUILD_LIBHDFSPP) + +if (BUILD_CPP_ENABLE_METRICS) + message(STATUS "Enable the metrics collection") + target_compile_definitions(orc PUBLIC ENABLE_METRICS=1) +else () + message(STATUS "Disable the metrics collection") + target_compile_definitions(orc PUBLIC ENABLE_METRICS=0) +endif () + add_dependencies(orc orc-format_ep) -install(TARGETS orc DESTINATION lib) +install(TARGETS orc EXPORT orc_targets) +install(EXPORT orc_targets + DESTINATION ${ORC_INSTALL_CMAKE_DIR} + NAMESPACE "orc::" + FILE "orcTargets.cmake") diff --git a/c++/src/ColumnPrinter.cc b/c++/src/ColumnPrinter.cc index 8b16ecbd09..6535c612ce 100644 --- a/c++/src/ColumnPrinter.cc +++ b/c++/src/ColumnPrinter.cc @@ -254,6 +254,8 @@ namespace orc { break; case BINARY: + case GEOMETRY: + case GEOGRAPHY: result 
= std::make_unique(buffer, param); break; diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc index e70f916ffd..89ff0e0245 100644 --- a/c++/src/ColumnReader.cc +++ b/c++/src/ColumnReader.cc @@ -395,7 +395,7 @@ namespace orc { int64_t bits = 0; if (bufferEnd_ - bufferPointer_ >= 8) { if (isLittleEndian) { - bits = *(reinterpret_cast(bufferPointer_)); + memcpy(&bits, bufferPointer_, sizeof(bits)); } else { bits = static_cast(static_cast(bufferPointer_[0])); bits |= static_cast(static_cast(bufferPointer_[1])) << 8; @@ -509,8 +509,10 @@ namespace orc { bufferNum = std::min(numValues, static_cast(bufferEnd_ - bufferPointer_) / bytesPerValue_); uint64_t bufferBytes = bufferNum * bytesPerValue_; - memcpy(outArray, bufferPointer_, bufferBytes); - bufferPointer_ += bufferBytes; + if (bufferBytes > 0) { + memcpy(outArray, bufferPointer_, bufferBytes); + bufferPointer_ += bufferBytes; + } } for (size_t i = bufferNum; i < numValues; ++i) { outArray[i] = readDouble(); @@ -724,6 +726,9 @@ namespace orc { if (totalBytes <= lastBufferLength_) { // subtract the needed bytes from the ones left over lastBufferLength_ -= totalBytes; + if (lastBuffer_ == nullptr) { + throw ParseError("StringDirectColumnReader::skip: lastBuffer_ is null"); + } lastBuffer_ += totalBytes; } else { // move the stream forward after accounting for the buffered bytes @@ -778,7 +783,9 @@ namespace orc { byteBatch.blob.resize(totalLength); char* ptr = byteBatch.blob.data(); while (bytesBuffered + lastBufferLength_ < totalLength) { - memcpy(ptr + bytesBuffered, lastBuffer_, lastBufferLength_); + if (lastBuffer_ != nullptr) { + memcpy(ptr + bytesBuffered, lastBuffer_, lastBufferLength_); + } bytesBuffered += lastBufferLength_; const void* readBuffer; int readLength; @@ -1740,6 +1747,8 @@ namespace orc { case CHAR: case STRING: case VARCHAR: + case GEOMETRY: + case GEOGRAPHY: switch (static_cast(stripe.getEncoding(type.getColumnId()).kind())) { case proto::ColumnEncoding_Kind_DICTIONARY: case proto::ColumnEncoding_Kind_DICTIONARY_V2: diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc index 86e30ce90d..915277ef41 100644 --- a/c++/src/ColumnWriter.cc +++ b/c++/src/ColumnWriter.cc @@ -17,13 +17,19 @@ */ #include "orc/Int128.hh" +#include "orc/Statistics.hh" +#include "orc/Type.hh" #include "orc/Writer.hh" +#include #include "ByteRLE.hh" #include "ColumnWriter.hh" #include "RLE.hh" #include "Statistics.hh" #include "Timezone.hh" +#include "Utils.hh" + +#include namespace orc { StreamsFactory::~StreamsFactory() { @@ -47,11 +53,11 @@ namespace orc { // In the future, we can decide compression strategy and modifier // based on stream kind. 
But for now we just use the setting from // WriterOption - return createCompressor(options_.getCompression(), outStream_, - options_.getCompressionStrategy(), - // BufferedOutputStream initial capacity - options_.getOutputBufferCapacity(), options_.getCompressionBlockSize(), - *options_.getMemoryPool(), options_.getWriterMetrics()); + return createCompressor( + options_.getCompression(), outStream_, options_.getCompressionStrategy(), + // BufferedOutputStream initial capacity + options_.getOutputBufferCapacity(), options_.getCompressionBlockSize(), + options_.getMemoryBlockSize(), *options_.getMemoryPool(), options_.getWriterMetrics()); } std::unique_ptr createStreamsFactory(const WriterOptions& options, @@ -253,6 +259,10 @@ namespace orc { // PASS } + void ColumnWriter::finishStreams() { + notNullEncoder->finishEncode(); + } + class StructColumnWriter : public ColumnWriter { public: StructColumnWriter(const Type& type, const StreamsFactory& factory, @@ -282,6 +292,8 @@ namespace orc { virtual void reset() override; + virtual void finishStreams() override; + private: std::vector> children_; }; @@ -415,6 +427,13 @@ namespace orc { } } + void StructColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->finishStreams(); + } + } + template class IntegerColumnWriter : public ColumnWriter { public: @@ -432,6 +451,8 @@ namespace orc { virtual void recordPosition() const override; + virtual void finishStreams() override; + protected: std::unique_ptr rleEncoder; @@ -527,6 +548,12 @@ namespace orc { rleEncoder->recordPosition(rowIndexPosition.get()); } + template + void IntegerColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + rleEncoder->finishEncode(); + } + template class ByteColumnWriter : public ColumnWriter { public: @@ -543,6 +570,8 @@ namespace orc { virtual void recordPosition() const override; + virtual void finishStreams() override; + private: std::unique_ptr byteRleEncoder_; }; @@ -591,7 +620,7 @@ namespace orc { if (enableBloomFilter) { bloomFilter->addLong(data[i]); } - intStats->update(static_cast(byteData[i]), 1); + intStats->update(static_cast(static_cast(byteData[i])), 1); } } intStats->increase(count); @@ -636,6 +665,12 @@ namespace orc { byteRleEncoder_->recordPosition(rowIndexPosition.get()); } + template + void ByteColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + byteRleEncoder_->finishEncode(); + } + template class BooleanColumnWriter : public ColumnWriter { public: @@ -653,6 +688,8 @@ namespace orc { virtual void recordPosition() const override; + virtual void finishStreams() override; + private: std::unique_ptr rleEncoder_; }; @@ -749,6 +786,12 @@ namespace orc { rleEncoder_->recordPosition(rowIndexPosition.get()); } + template + void BooleanColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + rleEncoder_->finishEncode(); + } + template class FloatingColumnWriter : public ColumnWriter { public: @@ -766,6 +809,8 @@ namespace orc { virtual void recordPosition() const override; + virtual void finishStreams() override; + private: bool isFloat_; std::unique_ptr dataStream_; @@ -877,30 +922,36 @@ namespace orc { dataStream_->recordPosition(rowIndexPosition.get()); } + template + void FloatingColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + dataStream_->finishStream(); + } + /** * Implementation of increasing sorted string dictionary */ class SortedStringDictionary { public: struct DictEntry { - DictEntry(const char* str, size_t len) : 
data(str), length(len) {} - const char* data; - size_t length; + DictEntry(const char* str, size_t len) : data(std::make_unique(str, len)) {} + + std::unique_ptr data; }; - SortedStringDictionary() : totalLength_(0) {} + SortedStringDictionary() : totalLength_(0) { + /// Need to set empty key otherwise dense_hash_map will not work correctly + keyToIndex_.set_empty_key(std::string_view{}); + } // insert a new string into dictionary, return its insertion order - size_t insert(const char* data, size_t len); + size_t insert(const char* str, size_t len); // write dictionary data & length to output buffer void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder) const; - // reorder input index buffer from insertion order to dictionary order - void reorder(std::vector& idxBuffer) const; - // get dict entries in insertion order - void getEntriesInInsertionOrder(std::vector&) const; + const std::vector& getEntriesInInsertionOrder() const; // return count of entries size_t size() const; @@ -911,18 +962,11 @@ namespace orc { void clear(); private: - struct LessThan { - bool operator()(const DictEntry& left, const DictEntry& right) const { - int ret = memcmp(left.data, right.data, std::min(left.length, right.length)); - if (ret != 0) { - return ret < 0; - } - return left.length < right.length; - } - }; + // store dictionary entries in insertion order + mutable std::vector flatDict_; - std::map dict_; - std::vector> data_; + // map from string to its insertion order index + google::dense_hash_map keyToIndex_; uint64_t totalLength_; // use friend class here to avoid being bothered by const function calls @@ -935,64 +979,39 @@ namespace orc { // insert a new string into dictionary, return its insertion order size_t SortedStringDictionary::insert(const char* str, size_t len) { - auto ret = dict_.insert({DictEntry(str, len), dict_.size()}); - if (ret.second) { - // make a copy to internal storage - data_.push_back(std::vector(len)); - memcpy(data_.back().data(), str, len); - // update dictionary entry to link pointer to internal storage - DictEntry* entry = const_cast(&(ret.first->first)); - entry->data = data_.back().data(); + size_t index = flatDict_.size(); + + auto it = keyToIndex_.find(std::string_view{str, len}); + if (it != keyToIndex_.end()) { + return it->second; + } else { + flatDict_.emplace_back(str, len); totalLength_ += len; + + const auto& lastEntry = flatDict_.back(); + keyToIndex_.emplace(std::string_view{lastEntry.data->data(), lastEntry.data->size()}, index); + return index; } - return ret.first->second; } // write dictionary data & length to output buffer void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder) const { - for (auto it = dict_.cbegin(); it != dict_.cend(); ++it) { - dataStream->write(it->first.data, it->first.length); - lengthEncoder->write(static_cast(it->first.length)); - } - } - - /** - * Reorder input index buffer from insertion order to dictionary order - * - * We require this function because string values are buffered by indexes - * in their insertion order. Until the entire dictionary is complete can - * we get their sorted indexes in the dictionary in that ORC specification - * demands dictionary should be ordered. Therefore this function transforms - * the indexes from insertion order to dictionary value order for final - * output. 
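For intuition on the dictionary rework above: entries are now appended to flatDict_ in insertion order and duplicates are detected through a hash map keyed by string_view, so the index returned by insert() is already the final one and the removed reorder() pass is no longer needed. A rough standalone sketch of the same idea, shown here with std::unordered_map in place of google::dense_hash_map:

    #include <cstddef>
    #include <memory>
    #include <string>
    #include <string_view>
    #include <unordered_map>
    #include <vector>

    // Minimal insertion-order string dictionary: insert() returns a stable index,
    // because entries are stored in the order they arrive and never reordered.
    // Each entry is heap-allocated (as in the patch) so the string_view keys in
    // the map stay valid when the vector grows.
    class InsertionOrderDictionary {
     public:
      size_t insert(const char* str, size_t len) {
        auto it = keyToIndex_.find(std::string_view{str, len});
        if (it != keyToIndex_.end()) {
          return it->second;  // duplicate: reuse the existing index
        }
        size_t index = entries_.size();
        entries_.push_back(std::make_unique<std::string>(str, len));
        keyToIndex_.emplace(std::string_view{*entries_.back()}, index);
        return index;
      }

      // Entries come back in insertion order, matching how indexes were handed out.
      const std::vector<std::unique_ptr<std::string>>& entries() const {
        return entries_;
      }

     private:
      std::vector<std::unique_ptr<std::string>> entries_;
      std::unordered_map<std::string_view, size_t> keyToIndex_;
    };
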
- */ - void SortedStringDictionary::reorder(std::vector& idxBuffer) const { - // iterate the dictionary to get mapping from insertion order to value order - std::vector mapping(dict_.size()); - size_t dictIdx = 0; - for (auto it = dict_.cbegin(); it != dict_.cend(); ++it) { - mapping[it->second] = dictIdx++; - } - - // do the transformation - for (size_t i = 0; i != idxBuffer.size(); ++i) { - idxBuffer[i] = static_cast(mapping[static_cast(idxBuffer[i])]); + for (const auto& entry : flatDict_) { + dataStream->write(entry.data->data(), entry.data->size()); + lengthEncoder->write(static_cast(entry.data->size())); } } // get dict entries in insertion order - void SortedStringDictionary::getEntriesInInsertionOrder( - std::vector& entries) const { - entries.resize(dict_.size()); - for (auto it = dict_.cbegin(); it != dict_.cend(); ++it) { - entries[it->second] = &(it->first); - } + const std::vector& + SortedStringDictionary::getEntriesInInsertionOrder() const { + return flatDict_; } // return count of entries size_t SortedStringDictionary::size() const { - return dict_.size(); + return flatDict_.size(); } // return total length of strings in the dictioanry @@ -1002,8 +1021,8 @@ namespace orc { void SortedStringDictionary::clear() { totalLength_ = 0; - data_.clear(); - dict_.clear(); + keyToIndex_.clear(); + flatDict_.clear(); } class StringColumnWriter : public ColumnWriter { @@ -1028,6 +1047,8 @@ namespace orc { virtual void reset() override; + virtual void finishStreams() override; + private: /** * dictionary related functions @@ -1221,6 +1242,14 @@ namespace orc { } } + void StringColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + if (!useDictionary) { + directDataStream->finishStream(); + directLengthEncoder->finishEncode(); + } + } + bool StringColumnWriter::checkDictionaryKeyRatio() { if (!doneDictionaryCheck) { useDictionary = dictionary.size() <= @@ -1295,9 +1324,6 @@ namespace orc { // flush dictionary data & length streams dictionary.flush(dictStream.get(), dictLengthEncoder.get()); - // convert index from insertion order to dictionary order - dictionary.reorder(dictionary.idxInDictBuffer_); - // write data sequences int64_t* data = dictionary.idxInDictBuffer_.data(); if (enableIndex) { @@ -1341,90 +1367,19 @@ namespace orc { } // get dictionary entries in insertion order - std::vector entries; - dictionary.getEntriesInInsertionOrder(entries); + const auto& entries = dictionary.getEntriesInInsertionOrder(); // store each length of the data into a vector - const SortedStringDictionary::DictEntry* dictEntry = nullptr; for (uint64_t i = 0; i != dictionary.idxInDictBuffer_.size(); ++i) { // write one row data in direct encoding - dictEntry = entries[static_cast(dictionary.idxInDictBuffer_[i])]; - directDataStream->write(dictEntry->data, dictEntry->length); - directLengthEncoder->write(static_cast(dictEntry->length)); + const auto& dictEntry = entries[static_cast(dictionary.idxInDictBuffer_[i])]; + directDataStream->write(dictEntry.data->data(), dictEntry.data->size()); + directLengthEncoder->write(static_cast(dictEntry.data->size())); } deleteDictStreams(); } - struct Utf8Utils { - /** - * Counts how many utf-8 chars of the input data - */ - static uint64_t charLength(const char* data, uint64_t length) { - uint64_t chars = 0; - for (uint64_t i = 0; i < length; i++) { - if (isUtfStartByte(data[i])) { - chars++; - } - } - return chars; - } - - /** - * Return the number of bytes required to read at most maxCharLength - * characters in full from a utf-8 encoded byte array 
provided - * by data. This does not validate utf-8 data, but - * operates correctly on already valid utf-8 data. - * - * @param maxCharLength number of characters required - * @param data the bytes of UTF-8 - * @param length the length of data to truncate - */ - static uint64_t truncateBytesTo(uint64_t maxCharLength, const char* data, uint64_t length) { - uint64_t chars = 0; - if (length <= maxCharLength) { - return length; - } - for (uint64_t i = 0; i < length; i++) { - if (isUtfStartByte(data[i])) { - chars++; - } - if (chars > maxCharLength) { - return i; - } - } - // everything fits - return length; - } - - /** - * Checks if b is the first byte of a UTF-8 character. - */ - inline static bool isUtfStartByte(char b) { - return (b & 0xC0) != 0x80; - } - - /** - * Find the start of the last character that ends in the current string. - * @param text the bytes of the utf-8 - * @param from the first byte location - * @param until the last byte location - * @return the index of the last character - */ - static uint64_t findLastCharacter(const char* text, uint64_t from, uint64_t until) { - uint64_t posn = until; - /* we don't expect characters more than 5 bytes */ - while (posn >= from) { - if (isUtfStartByte(text[posn])) { - return posn; - } - posn -= 1; - } - /* beginning of a valid char not found */ - throw std::logic_error("Could not truncate string, beginning of a valid char not found"); - } - }; - class CharColumnWriter : public StringColumnWriter { public: CharColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options) @@ -1639,6 +1594,8 @@ namespace orc { virtual void recordPosition() const override; + virtual void finishStreams() override; + protected: std::unique_ptr secRleEncoder, nanoRleEncoder; @@ -1779,6 +1736,12 @@ namespace orc { nanoRleEncoder->recordPosition(rowIndexPosition.get()); } + void TimestampColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + secRleEncoder->finishEncode(); + nanoRleEncoder->finishEncode(); + } + class DateColumnWriter : public IntegerColumnWriter { public: DateColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); @@ -1848,6 +1811,8 @@ namespace orc { virtual void recordPosition() const override; + virtual void finishStreams() override; + protected: RleVersion rleVersion; uint64_t precision; @@ -1966,6 +1931,12 @@ namespace orc { scaleEncoder->recordPosition(rowIndexPosition.get()); } + void Decimal64ColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + valueStream->finishStream(); + scaleEncoder->finishEncode(); + } + class Decimal64ColumnWriterV2 : public ColumnWriter { public: Decimal64ColumnWriterV2(const Type& type, const StreamsFactory& factory, @@ -1982,6 +1953,8 @@ namespace orc { virtual void recordPosition() const override; + virtual void finishStreams() override; + protected: uint64_t precision; uint64_t scale; @@ -2072,6 +2045,11 @@ namespace orc { valueEncoder->recordPosition(rowIndexPosition.get()); } + void Decimal64ColumnWriterV2::finishStreams() { + ColumnWriter::finishStreams(); + valueEncoder->finishEncode(); + } + class Decimal128ColumnWriter : public Decimal64ColumnWriter { public: Decimal128ColumnWriter(const Type& type, const StreamsFactory& factory, @@ -2187,6 +2165,8 @@ namespace orc { virtual void reset() override; + virtual void finishStreams() override; + private: std::unique_ptr lengthEncoder_; RleVersion rleVersion_; @@ -2363,6 +2343,14 @@ namespace orc { } } + void ListColumnWriter::finishStreams() { + 
ColumnWriter::finishStreams(); + lengthEncoder_->finishEncode(); + if (child_) { + child_->finishStreams(); + } + } + class MapColumnWriter : public ColumnWriter { public: MapColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options); @@ -2395,6 +2383,8 @@ namespace orc { virtual void reset() override; + virtual void finishStreams() override; + private: std::unique_ptr keyWriter_; std::unique_ptr elemWriter_; @@ -2613,6 +2603,17 @@ namespace orc { } } + void MapColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + lengthEncoder_->finishEncode(); + if (keyWriter_) { + keyWriter_->finishStreams(); + } + if (elemWriter_) { + elemWriter_->finishStreams(); + } + } + class UnionColumnWriter : public ColumnWriter { public: UnionColumnWriter(const Type& type, const StreamsFactory& factory, @@ -2645,6 +2646,8 @@ namespace orc { virtual void reset() override; + virtual void finishStreams() override; + private: std::unique_ptr rleEncoder_; std::vector> children_; @@ -2816,6 +2819,73 @@ namespace orc { } } + void UnionColumnWriter::finishStreams() { + ColumnWriter::finishStreams(); + rleEncoder_->finishEncode(); + for (uint32_t i = 0; i < children_.size(); ++i) { + children_[i]->finishStreams(); + } + } + + class GeospatialColumnWriter : public BinaryColumnWriter { + public: + GeospatialColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options) + : BinaryColumnWriter(type, factory, options), + isGeometry_(type.getKind() == TypeKind::GEOMETRY) {} + + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, + const char* incomingMask) override { + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const StringVectorBatch* strBatch = dynamic_cast(&rowBatch); + if (strBatch == nullptr) { + throw InvalidArgument("Failed to cast to StringVectorBatch"); + } + auto data = &strBatch->data[offset]; + auto length = &strBatch->length[offset]; + const char* notNull = strBatch->hasNulls ? 
strBatch->notNull.data() + offset : nullptr; + + bool hasNull = false; + GeospatialColumnStatisticsImpl* geoStats = nullptr; + if (isGeometry_) { + geoStats = dynamic_cast(colIndexStatistics.get()); + } + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { + uint64_t len = static_cast(length[i]); + directDataStream->write(data[i], len); + + // update stats + if (geoStats) { + ++count; + geoStats->update(data[i], len); + } + + if (enableBloomFilter) { + bloomFilter->addBytes(data[i], length[i]); + } + } else if (!hasNull) { + hasNull = true; + if (geoStats) { + geoStats->setHasNull(hasNull); + } + } + } + + directLengthEncoder->add(length, numValues, notNull); + + if (geoStats) { + geoStats->increase(count); + } + } + + private: + bool isGeometry_; + }; + std::unique_ptr buildWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options) { switch (static_cast(type.getKind())) { @@ -2886,6 +2956,9 @@ namespace orc { return std::make_unique(type, factory, options); case UNION: return std::make_unique(type, factory, options); + case GEOMETRY: + case GEOGRAPHY: + return std::make_unique(type, factory, options); default: throw NotImplementedYet( "Type is not supported yet for creating " diff --git a/c++/src/ColumnWriter.hh b/c++/src/ColumnWriter.hh index 8afd1eb72c..1c5e15d707 100644 --- a/c++/src/ColumnWriter.hh +++ b/c++/src/ColumnWriter.hh @@ -179,6 +179,18 @@ namespace orc { */ virtual void writeDictionary(); + /** + * Finalize the encoding and compressing process. This function should be + * called after all data required for encoding has been added. It ensures + * that any remaining data is processed and the final state of the streams + * is set. + * Note: boolean type cannot cut off the current byte if it is not filled + * with 8 bits, otherwise Boolean RLE may incorrectly read the unfilled + * trailing bits. In this case, the last byte will be the head of the next + * compression block. 
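A concrete illustration of the boolean caveat above (row counts chosen for the example): with 10,000 values in a row group, a boolean column packs into exactly 1,250 bytes and the compression block can end on the row-group boundary; with 10,004 values, the last byte carries only 4 valid bits, so it cannot be cut off and instead becomes the first byte of the next compression block.
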
+ */ + virtual void finishStreams(); + protected: /** * Utility function to translate ColumnStatistics into protobuf form and diff --git a/c++/src/Compression.cc b/c++/src/Compression.cc index 4002276e18..f373a75bff 100644 --- a/c++/src/Compression.cc +++ b/c++/src/Compression.cc @@ -52,19 +52,22 @@ namespace orc { class CompressionStreamBase : public BufferedOutputStream { public: CompressionStreamBase(OutputStream* outStream, int compressionLevel, uint64_t capacity, - uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics); + uint64_t compressionBlockSize, uint64_t memoryBlockSize, MemoryPool& pool, + WriterMetrics* metrics); virtual bool Next(void** data, int* size) override = 0; - virtual void BackUp(int count) override; + virtual void BackUp(int count) override = 0; virtual std::string getName() const override = 0; - virtual uint64_t flush() override; - virtual void suppress() override; + virtual uint64_t flush() override = 0; + virtual void suppress() override = 0; virtual bool isCompressed() const override { return true; } virtual uint64_t getSize() const override; + virtual uint64_t getRawInputBufferSize() const override = 0; + virtual void finishStream() override = 0; protected: void writeData(const unsigned char* data, int size); @@ -78,9 +81,6 @@ namespace orc { // ensure enough room for compression block header void ensureHeader(); - // Buffer to hold uncompressed data until user calls Next() - DataBuffer rawInputBuffer; - // Compress level int level; @@ -99,46 +99,26 @@ namespace orc { // Compression block header pointer array static const uint32_t HEADER_SIZE = 3; std::array header; + + // Compression block size + uint64_t compressionBlockSize; }; CompressionStreamBase::CompressionStreamBase(OutputStream* outStream, int compressionLevel, - uint64_t capacity, uint64_t blockSize, - MemoryPool& pool, WriterMetrics* metrics) - : BufferedOutputStream(pool, outStream, capacity, blockSize, metrics), - rawInputBuffer(pool, blockSize), + uint64_t capacity, uint64_t compressionBlockSize, + uint64_t memoryBlockSize, MemoryPool& pool, + WriterMetrics* metrics) + : BufferedOutputStream(pool, outStream, capacity, memoryBlockSize, metrics), level(compressionLevel), outputBuffer(nullptr), bufferSize(0), outputPosition(0), - outputSize(0) { + outputSize(0), + compressionBlockSize(compressionBlockSize) { // init header pointer array header.fill(nullptr); } - void CompressionStreamBase::BackUp(int count) { - if (count > bufferSize) { - throw std::logic_error("Can't backup that much!"); - } - bufferSize -= count; - } - - uint64_t CompressionStreamBase::flush() { - void* data; - int size; - if (!Next(&data, &size)) { - throw std::runtime_error("Failed to flush compression buffer."); - } - BufferedOutputStream::BackUp(outputSize - outputPosition); - bufferSize = outputSize = outputPosition = 0; - return BufferedOutputStream::flush(); - } - - void CompressionStreamBase::suppress() { - outputBuffer = nullptr; - bufferSize = outputPosition = outputSize = 0; - BufferedOutputStream::suppress(); - } - uint64_t CompressionStreamBase::getSize() const { return BufferedOutputStream::getSize() - static_cast(outputSize - outputPosition); } @@ -149,12 +129,12 @@ namespace orc { while (offset < size) { if (outputPosition == outputSize) { if (!BufferedOutputStream::Next(reinterpret_cast(&outputBuffer), &outputSize)) { - throw std::runtime_error("Failed to get next output buffer from output stream."); + throw CompressionError("Failed to get next output buffer from output stream."); } outputPosition = 0; } 
else if (outputPosition > outputSize) { // for safety this will unlikely happen - throw std::logic_error("Write to an out-of-bound place during compression!"); + throw CompressionError("Write to an out-of-bound place during compression!"); } int currentSize = std::min(outputSize - outputPosition, size - offset); memcpy(outputBuffer + outputPosition, data + offset, static_cast(currentSize)); @@ -168,7 +148,7 @@ namespace orc { for (uint32_t i = 0; i < HEADER_SIZE; ++i) { if (outputPosition >= outputSize) { if (!BufferedOutputStream::Next(reinterpret_cast(&outputBuffer), &outputSize)) { - throw std::runtime_error("Failed to get next output buffer from output stream."); + throw CompressionError("Failed to get next output buffer from output stream."); } outputPosition = 0; } @@ -183,31 +163,74 @@ namespace orc { class CompressionStream : public CompressionStreamBase { public: CompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity, - uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics); + uint64_t compressionBlockSize, uint64_t memoryBlockSize, MemoryPool& pool, + WriterMetrics* metrics); virtual bool Next(void** data, int* size) override; virtual std::string getName() const override = 0; + virtual void BackUp(int count) override; + virtual void suppress() override; + virtual uint64_t flush() override; + uint64_t getRawInputBufferSize() const override { + return rawInputBuffer.size(); + } + virtual void finishStream() override { + compressInternal(); + BufferedOutputStream::finishStream(); + } protected: // return total compressed size virtual uint64_t doStreamingCompression() = 0; + + // Buffer to hold uncompressed data until user calls Next() + BlockBuffer rawInputBuffer; + + void compressInternal(); }; + void CompressionStream::BackUp(int count) { + uint64_t backup = static_cast(count); + uint64_t currSize = rawInputBuffer.size(); + if (backup > currSize) { + throw CompressionError("Can't backup that much!"); + } + rawInputBuffer.resize(currSize - backup); + } + + uint64_t CompressionStream::flush() { + compressInternal(); + BufferedOutputStream::BackUp(outputSize - outputPosition); + rawInputBuffer.resize(0); + outputSize = outputPosition = 0; + return BufferedOutputStream::flush(); + } + + void CompressionStream::suppress() { + outputBuffer = nullptr; + outputPosition = outputSize = 0; + rawInputBuffer.resize(0); + BufferedOutputStream::suppress(); + } + CompressionStream::CompressionStream(OutputStream* outStream, int compressionLevel, - uint64_t capacity, uint64_t blockSize, MemoryPool& pool, + uint64_t capacity, uint64_t compressionBlockSize, + uint64_t memoryBlockSize, MemoryPool& pool, WriterMetrics* metrics) - : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, pool, metrics) { + : CompressionStreamBase(outStream, compressionLevel, capacity, compressionBlockSize, + memoryBlockSize, pool, metrics), + rawInputBuffer(pool, memoryBlockSize) { // PASS } - bool CompressionStream::Next(void** data, int* size) { - if (bufferSize != 0) { + void CompressionStream::compressInternal() { + if (rawInputBuffer.size() != 0) { ensureHeader(); uint64_t preSize = getSize(); uint64_t totalCompressedSize = doStreamingCompression(); - if (totalCompressedSize >= static_cast(bufferSize)) { - writeHeader(static_cast(bufferSize), true); + if (totalCompressedSize >= static_cast(rawInputBuffer.size())) { + writeHeader(static_cast(rawInputBuffer.size()), true); // reset output buffer outputBuffer = nullptr; outputPosition = outputSize = 0; @@ -215,23 
+238,42 @@ namespace orc { BufferedOutputStream::BackUp(static_cast(backup)); // copy raw input buffer into block buffer - writeData(rawInputBuffer.data(), bufferSize); + uint64_t blockNumber = rawInputBuffer.getBlockNumber(); + for (uint64_t i = 0; i < blockNumber; ++i) { + auto block = rawInputBuffer.getBlock(i); + writeData(reinterpret_cast(block.data), block.size); + } } else { writeHeader(totalCompressedSize, false); } + rawInputBuffer.resize(0); } + } - *data = rawInputBuffer.data(); - *size = static_cast(rawInputBuffer.size()); - bufferSize = *size; + bool CompressionStream::Next(void** data, int* size) { + if (rawInputBuffer.size() > compressionBlockSize) { + std::stringstream ss; + ss << "uncompressed data size " << rawInputBuffer.size() + << " is larger than compression block size " << compressionBlockSize; + throw CompressionError(ss.str()); + } + + // compress data in the rawInputBuffer when it is full + if (rawInputBuffer.size() == compressionBlockSize) { + compressInternal(); + } + auto block = rawInputBuffer.getNextBlock(); + *data = block.data; + *size = static_cast(block.size); return true; } class ZlibCompressionStream : public CompressionStream { public: - ZlibCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity, - uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics); + ZlibCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t bufferCapacity, + uint64_t compressionBlockSize, uint64_t memoryBlockSize, MemoryPool& pool, + WriterMetrics* metrics); virtual ~ZlibCompressionStream() override { end(); @@ -249,42 +291,57 @@ namespace orc { }; ZlibCompressionStream::ZlibCompressionStream(OutputStream* outStream, int compressionLevel, - uint64_t capacity, uint64_t blockSize, - MemoryPool& pool, WriterMetrics* metrics) - : CompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) { + uint64_t bufferCapacity, + uint64_t compressionBlockSize, + uint64_t memoryBlockSize, MemoryPool& pool, + WriterMetrics* metrics) + : CompressionStream(outStream, compressionLevel, bufferCapacity, compressionBlockSize, + memoryBlockSize, pool, metrics) { init(); } uint64_t ZlibCompressionStream::doStreamingCompression() { if (deflateReset(&strm_) != Z_OK) { - throw std::runtime_error("Failed to reset inflate."); + throw CompressionError("Failed to reset inflate."); } - strm_.avail_in = static_cast(bufferSize); - strm_.next_in = rawInputBuffer.data(); + // iterate through all blocks + uint64_t blockId = 0; + bool finish = false; do { - if (outputPosition >= outputSize) { - if (!BufferedOutputStream::Next(reinterpret_cast(&outputBuffer), &outputSize)) { - throw std::runtime_error("Failed to get next output buffer from output stream."); - } - outputPosition = 0; + if (blockId == rawInputBuffer.getBlockNumber()) { + finish = true; + strm_.avail_in = 0; + strm_.next_in = nullptr; + } else { + auto block = rawInputBuffer.getBlock(blockId++); + strm_.avail_in = static_cast(block.size); + strm_.next_in = reinterpret_cast(block.data); } - strm_.next_out = reinterpret_cast(outputBuffer + outputPosition); - strm_.avail_out = static_cast(outputSize - outputPosition); - int ret = deflate(&strm_, Z_FINISH); - outputPosition = outputSize - static_cast(strm_.avail_out); + do { + if (outputPosition >= outputSize) { + if (!BufferedOutputStream::Next(reinterpret_cast(&outputBuffer), &outputSize)) { + throw CompressionError("Failed to get next output buffer from output stream."); + } + outputPosition = 0; + } + strm_.next_out = 
reinterpret_cast(outputBuffer + outputPosition); + strm_.avail_out = static_cast(outputSize - outputPosition); - if (ret == Z_STREAM_END) { - break; - } else if (ret == Z_OK) { - // needs more buffer so will continue the loop - } else { - throw std::runtime_error("Failed to deflate input data."); - } - } while (strm_.avail_out == 0); + int ret = deflate(&strm_, finish ? Z_FINISH : Z_NO_FLUSH); + outputPosition = outputSize - static_cast(strm_.avail_out); + if (ret == Z_STREAM_END) { + break; + } else if (ret == Z_OK) { + // needs more buffer so will continue the loop + } else { + throw CompressionError("Failed to deflate input data."); + } + } while (strm_.avail_out == 0); + } while (!finish); return strm_.total_out; } @@ -305,7 +362,7 @@ namespace orc { strm_.next_in = nullptr; if (deflateInit2(&strm_, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) { - throw std::runtime_error("Error while calling deflateInit2() for zlib."); + throw CompressionError("Error while calling deflateInit2() for zlib."); } } @@ -505,7 +562,7 @@ namespace orc { } else if (state == DECOMPRESS_START) { NextDecompress(data, size, availableSize); } else { - throw std::logic_error( + throw CompressionError( "Unknown compression state in " "DecompressionStream::Next"); } @@ -519,7 +576,7 @@ namespace orc { void DecompressionStream::BackUp(int count) { if (outputBuffer == nullptr || outputBufferLength != 0) { - throw std::logic_error("Backup without previous Next in " + getName()); + throw CompressionError("Backup without previous Next in " + getName()); } outputBuffer -= static_cast(count); outputBufferLength = static_cast(count); @@ -647,13 +704,17 @@ namespace orc { case Z_OK: break; case Z_MEM_ERROR: - throw std::logic_error("Memory error from inflateInit2"); + throw CompressionError( + "Memory error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2"); case Z_VERSION_ERROR: - throw std::logic_error("Version error from inflateInit2"); + throw CompressionError( + "Version error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2"); case Z_STREAM_ERROR: - throw std::logic_error("Stream error from inflateInit2"); + throw CompressionError( + "Stream error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2"); default: - throw std::logic_error("Unknown error from inflateInit2"); + throw CompressionError( + "Unknown error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2"); } } @@ -674,7 +735,7 @@ namespace orc { zstream_.next_out = reinterpret_cast(const_cast(outputBuffer)); zstream_.avail_out = static_cast(outputDataBuffer.capacity()); if (inflateReset(&zstream_) != Z_OK) { - throw std::logic_error( + throw CompressionError( "Bad inflateReset in " "ZlibDecompressionStream::NextDecompress"); } @@ -694,19 +755,19 @@ namespace orc { case Z_STREAM_END: break; case Z_BUF_ERROR: - throw std::logic_error( + throw CompressionError( "Buffer error in " "ZlibDecompressionStream::NextDecompress"); case Z_DATA_ERROR: - throw std::logic_error( + throw CompressionError( "Data error in " "ZlibDecompressionStream::NextDecompress"); case Z_STREAM_ERROR: - throw std::logic_error( + throw CompressionError( "Stream error in " "ZlibDecompressionStream::NextDecompress"); default: - throw std::logic_error( + throw CompressionError( "Unknown error in " "ZlibDecompressionStream::NextDecompress"); } @@ -812,7 +873,7 @@ namespace orc { } if (outLength > maxOutputLength) { - throw std::logic_error("Snappy length exceeds block size"); + throw CompressionError("Snappy length 
exceeds block size"); } if (!snappy::RawUncompress(input, length, output)) { @@ -881,14 +942,23 @@ namespace orc { public: BlockCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity, uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics) - : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, pool, metrics), - compressorBuffer(pool) { + : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, blockSize, pool, + metrics), + compressorBuffer(pool), + rawInputBuffer(pool, blockSize) { // PASS } virtual bool Next(void** data, int* size) override; virtual void suppress() override; + virtual void BackUp(int count) override; + virtual uint64_t flush() override; virtual std::string getName() const override = 0; + uint64_t getRawInputBufferSize() const override { + return bufferSize; + } + + virtual void finishStream() override; protected: // compresses a block and returns the compressed size @@ -900,8 +970,23 @@ namespace orc { // should allocate max possible compressed size DataBuffer compressorBuffer; + + // Buffer to hold uncompressed data until user calls Next() + DataBuffer rawInputBuffer; }; + void BlockCompressionStream::BackUp(int count) { + if (count > bufferSize) { + throw CompressionError("Can't backup that much!"); + } + bufferSize -= count; + } + + uint64_t BlockCompressionStream::flush() { + finishStream(); + return BufferedOutputStream::flush(); + } + bool BlockCompressionStream::Next(void** data, int* size) { if (bufferSize != 0) { ensureHeader(); @@ -935,7 +1020,19 @@ namespace orc { void BlockCompressionStream::suppress() { compressorBuffer.resize(0); - CompressionStreamBase::suppress(); + outputBuffer = nullptr; + bufferSize = outputPosition = outputSize = 0; + BufferedOutputStream::suppress(); + } + + void BlockCompressionStream::finishStream() { + void* data; + int size; + if (!Next(&data, &size)) { + throw CompressionError("Failed to flush compression buffer."); + } + BufferedOutputStream::BackUp(outputSize - outputPosition); + bufferSize = outputSize = outputPosition = 0; } /** @@ -976,7 +1073,7 @@ namespace orc { reinterpret_cast(compressorBuffer.data()), bufferSize, static_cast(compressorBuffer.size()), level); if (result == 0) { - throw std::runtime_error("Error during block compression using lz4."); + throw CompressionError("Error during block compression using lz4."); } return static_cast(result); } @@ -984,7 +1081,7 @@ namespace orc { void Lz4CompressionSteam::init() { state_ = LZ4_createStream(); if (!state_) { - throw std::runtime_error("Error while allocating state for lz4."); + throw CompressionError("Error while allocating state for lz4."); } } @@ -1072,7 +1169,7 @@ namespace orc { void ZSTDCompressionStream::init() { cctx_ = ZSTD_createCCtx(); if (!cctx_) { - throw std::runtime_error("Error while calling ZSTD_createCCtx() for zstd."); + throw CompressionError("Error while calling ZSTD_createCCtx() for zstd."); } } @@ -1129,7 +1226,7 @@ namespace orc { void ZSTDDecompressionStream::init() { dctx_ = ZSTD_createDCtx(); if (!dctx_) { - throw std::runtime_error("Error while calling ZSTD_createDCtx() for zstd."); + throw CompressionError("Error while calling ZSTD_createDCtx() for zstd."); } } @@ -1140,12 +1237,10 @@ namespace orc { DIAGNOSTIC_PUSH - std::unique_ptr createCompressor(CompressionKind kind, - OutputStream* outStream, - CompressionStrategy strategy, - uint64_t bufferCapacity, - uint64_t compressionBlockSize, - MemoryPool& pool, WriterMetrics* metrics) { + std::unique_ptr 
createCompressor( + CompressionKind kind, OutputStream* outStream, CompressionStrategy strategy, + uint64_t bufferCapacity, uint64_t compressionBlockSize, uint64_t memoryBlockSize, + MemoryPool& pool, WriterMetrics* metrics) { switch (static_cast(kind)) { case CompressionKind_NONE: { return std::make_unique(pool, outStream, bufferCapacity, @@ -1154,8 +1249,8 @@ namespace orc { case CompressionKind_ZLIB: { int level = (strategy == CompressionStrategy_SPEED) ? Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION; - return std::make_unique(outStream, level, bufferCapacity, - compressionBlockSize, pool, metrics); + return std::make_unique( + outStream, level, bufferCapacity, compressionBlockSize, memoryBlockSize, pool, metrics); } case CompressionKind_ZSTD: { int level = (strategy == CompressionStrategy_SPEED) ? 1 : ZSTD_CLEVEL_DEFAULT; diff --git a/c++/src/Compression.hh b/c++/src/Compression.hh index 55b152dd63..24170c56b4 100644 --- a/c++/src/Compression.hh +++ b/c++/src/Compression.hh @@ -42,15 +42,16 @@ namespace orc { * @param outStream the output stream that is the underlying target * @param strategy compression strategy * @param bufferCapacity compression stream buffer total capacity - * @param compressionBlockSize compression buffer block size + * @param compressionBlockSize compression is triggered when the original input buffer size + * reaches this size + * @param memoryBlockSize the block size for original input buffer * @param pool the memory pool + * @param metrics the writer metrics */ - std::unique_ptr createCompressor(CompressionKind kind, - OutputStream* outStream, - CompressionStrategy strategy, - uint64_t bufferCapacity, - uint64_t compressionBlockSize, - MemoryPool& pool, WriterMetrics* metrics); + std::unique_ptr createCompressor( + CompressionKind kind, OutputStream* outStream, CompressionStrategy strategy, + uint64_t bufferCapacity, uint64_t compressionBlockSize, uint64_t memoryBlockSize, + MemoryPool& pool, WriterMetrics* metrics); } // namespace orc #endif diff --git a/c++/src/ConvertColumnReader.cc b/c++/src/ConvertColumnReader.cc index 67ee6d6c45..7db5b88954 100644 --- a/c++/src/ConvertColumnReader.cc +++ b/c++/src/ConvertColumnReader.cc @@ -17,6 +17,9 @@ */ #include "ConvertColumnReader.hh" +#include "Utils.hh" + +#include namespace orc { @@ -72,6 +75,23 @@ namespace orc { } } + static inline void handleParseFromStringError(ColumnVectorBatch& dstBatch, uint64_t idx, + bool shouldThrow, const std::string& typeName, + const std::string& str, + const std::string& expectedFormat = "") { + if (!shouldThrow) { + dstBatch.notNull.data()[idx] = 0; + dstBatch.hasNulls = true; + } else { + std::ostringstream ss; + ss << "Failed to parse " << typeName << " from string:" << str; + if (expectedFormat != "") { + ss << " the following format \"" << expectedFormat << "\" is expected"; + } + throw SchemaEvolutionError(ss.str()); + } + } + // return false if overflow template static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) { @@ -106,13 +126,13 @@ namespace orc { bool shouldThrow) { constexpr bool isFileTypeFloatingPoint(std::is_floating_point::value); constexpr bool isReadTypeFloatingPoint(std::is_floating_point::value); - int64_t longValue = static_cast(srcValue); + if (isFileTypeFloatingPoint) { if (isReadTypeFloatingPoint) { destValue = static_cast(srcValue); } else { if (!canFitInLong(static_cast(srcValue)) || - !downCastToInteger(destValue, longValue)) { + !downCastToInteger(destValue, static_cast(srcValue))) { handleOverflow(destBatch, idx, shouldThrow); } } @@ 
-399,13 +419,14 @@ namespace orc { ConvertToTimestampColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe, bool throwOnOverflow) : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow), - readerTimezone(readType.getKind() == TIMESTAMP_INSTANT ? &getTimezoneByName("GMT") - : &stripe.getReaderTimezone()), + isInstant(readType.getKind() == TIMESTAMP_INSTANT), + readerTimezone(isInstant ? &getTimezoneByName("GMT") : &stripe.getReaderTimezone()), needConvertTimezone(readerTimezone != &getTimezoneByName("GMT")) {} void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override; protected: + const bool isInstant; const orc::Timezone* readerTimezone; const bool needConvertTimezone; }; @@ -558,6 +579,8 @@ namespace orc { const auto& srcBatch = *SafeCastBatchTo(data.get()); auto& dstBatch = *SafeCastBatchTo(&rowBatch); + dstBatch.precision = toPrecision_; + dstBatch.scale = toScale_; for (uint64_t i = 0; i < numValues; ++i) { if (!rowBatch.hasNulls || rowBatch.notNull[i]) { convertDecimalToDecimal(dstBatch, i, srcBatch); @@ -694,6 +717,318 @@ namespace orc { const int32_t scale_; }; + template + class StringVariantToNumericColumnReader : public ConvertColumnReader { + public: + StringVariantToNumericColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertColumnReader::next(rowBatch, numValues, notNull); + + const auto& srcBatch = *SafeCastBatchTo(data.get()); + auto& dstBatch = *SafeCastBatchTo(&rowBatch); + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + if constexpr (std::is_floating_point_v) { + convertToDouble(dstBatch, srcBatch, i); + } else { + convertToInteger(dstBatch, srcBatch, i); + } + } + } + } + + private: + void convertToInteger(ReadTypeBatch& dstBatch, const StringVectorBatch& srcBatch, + uint64_t idx) { + int64_t longValue = 0; + const std::string longStr(srcBatch.data[idx], srcBatch.length[idx]); + try { + longValue = std::stoll(longStr); + } catch (...) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Long", longStr); + return; + } + if constexpr (std::is_same_v) { + dstBatch.data[idx] = longValue == 0 ? 0 : 1; + } else { + if (!downCastToInteger(dstBatch.data[idx], longValue)) { + handleOverflow(dstBatch, idx, throwOnOverflow); + } + } + } + + void convertToDouble(ReadTypeBatch& dstBatch, const StringVectorBatch& srcBatch, uint64_t idx) { + const std::string floatValue(srcBatch.data[idx], srcBatch.length[idx]); + try { + if constexpr (std::is_same_v) { + dstBatch.data[idx] = std::stof(floatValue); + } else { + dstBatch.data[idx] = std::stod(floatValue); + } + } catch (...) 
{ + handleParseFromStringError(dstBatch, idx, throwOnOverflow, typeid(readType).name(), + floatValue); + } + } + }; + + class StringVariantConvertColumnReader : public ConvertToStringVariantColumnReader { + public: + StringVariantConvertColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertToStringVariantColumnReader(readType, fileType, stripe, throwOnOverflow) {} + + uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override { + uint64_t size = 0; + strBuffer.resize(numValues); + const auto& srcBatch = *SafeCastBatchTo(data.get()); + const auto maxLength = readType.getMaximumLength(); + if (readType.getKind() == STRING) { + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + strBuffer[i] = std::string(srcBatch.data[i], srcBatch.length[i]); + size += strBuffer[i].size(); + } + } + } else if (readType.getKind() == VARCHAR) { + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + const char* charData = srcBatch.data[i]; + uint64_t originLength = srcBatch.length[i]; + uint64_t itemLength = Utf8Utils::truncateBytesTo(maxLength, charData, originLength); + strBuffer[i] = std::string(charData, itemLength); + size += strBuffer[i].length(); + } + } + } else if (readType.getKind() == CHAR) { + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + const char* charData = srcBatch.data[i]; + uint64_t originLength = srcBatch.length[i]; + uint64_t charLength = Utf8Utils::charLength(charData, originLength); + auto itemLength = Utf8Utils::truncateBytesTo(maxLength, charData, originLength); + strBuffer[i] = std::string(srcBatch.data[i], itemLength); + // the padding is exactly 1 byte per char + if (charLength < maxLength) { + strBuffer[i].resize(itemLength + maxLength - charLength, ' '); + } + size += strBuffer[i].length(); + } + } + } else { + throw SchemaEvolutionError("Invalid type for numeric to string conversion: " + + readType.toString()); + } + return size; + } + }; + + class StringVariantToTimestampColumnReader : public ConvertToTimestampColumnReader { + public: + StringVariantToTimestampColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertToTimestampColumnReader(readType, fileType, stripe, throwOnOverflow) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull); + + const auto& srcBatch = *SafeCastBatchTo(data.get()); + auto& dstBatch = *SafeCastBatchTo(&rowBatch); + + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + convertToTimestamp(dstBatch, i, std::string(srcBatch.data[i], srcBatch.length[i])); + } + } + } + + private: + // Algorithm: http://howardhinnant.github.io/date_algorithms.html + // The algorithm implements a proleptic Gregorian calendar. + int64_t daysFromProlepticGregorianCalendar(int32_t y, int32_t m, int32_t d) { + y -= m <= 2; + int32_t era = y / 400; + int32_t yoe = y - era * 400; // [0, 399] + int32_t doy = (153 * (m + (m > 2 ? 
-3 : 9)) + 2) / 5 + d - 1; // [0, 365] + int32_t doe = yoe * 365 + yoe / 4 - yoe / 100 + doy; // [0, 146096] + return 1ll * era * 146097 + doe - 719468; + } + + std::optional> tryBestToParseFromString( + const std::string& timeStr) { + int32_t year, month, day, hour, min, sec, nanos = 0; + int32_t matched = std::sscanf(timeStr.c_str(), "%4d-%2d-%2d %2d:%2d:%2d.%d", &year, &month, + &day, &hour, &min, &sec, &nanos); + if (matched != 6 && matched != 7) { + return std::nullopt; + } + if (nanos) { + if (nanos < 0 || nanos >= 1e9) { + return std::nullopt; + } + while (nanos < static_cast(1e8)) { + nanos *= 10; + } + } + int64_t daysSinceEpoch = daysFromProlepticGregorianCalendar(year, month, day); + int64_t secondSinceEpoch = 60ll * (60 * (24L * daysSinceEpoch + hour) + min) + sec; + return std::make_optional(std::pair{secondSinceEpoch, nanos}); + } + + void convertToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx, + const std::string& timeStr) { + // Expected timestamp_instant format string : yyyy-mm-dd hh:mm:ss[.xxx] timezone + // Eg. "2019-07-09 13:11:00 America/Los_Angeles" + // Expected timestamp format string : yyyy-mm-dd hh:mm:ss[.xxx] + // Eg. "2019-07-09 13:11:00" + static std::string expectedTimestampInstantFormat = "yyyy-mm-dd hh:mm:ss[.xxx] timezone"; + static std::string expectedTimestampFormat = "yyyy-mm-dd hh:mm:ss[.xxx]"; + auto timestamp = tryBestToParseFromString(timeStr); + if (!timestamp.has_value()) { + if (!isInstant) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp", timeStr, + expectedTimestampFormat); + return; + } + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr, + expectedTimestampInstantFormat); + return; + } + + auto& [second, nanos] = timestamp.value(); + + if (isInstant) { + size_t pos = 0; // get the name of timezone + pos = timeStr.find(' ', pos) + 1; + pos = timeStr.find(' ', pos); + if (pos == std::string::npos) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr, + expectedTimestampInstantFormat); + return; + } + pos += 1; + size_t subStrLength = timeStr.length() - pos; + try { + second = getTimezoneByName(timeStr.substr(pos, subStrLength)).convertFromUTC(second); + } catch (const TimezoneError&) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr, + expectedTimestampInstantFormat); + return; + } + } else { + if (needConvertTimezone) { + second = readerTimezone->convertFromUTC(second); + } + } + dstBatch.data[idx] = second; + dstBatch.nanoseconds[idx] = nanos; + } + }; + + template + class StringVariantToDecimalColumnReader : public ConvertColumnReader { + public: + StringVariantToDecimalColumnReader(const Type& readType, const Type& fileType, + StripeStreams& stripe, bool throwOnOverflow) + : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow), + precision_(static_cast(readType.getPrecision())), + scale_(static_cast(readType.getScale())) {} + + void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override { + ConvertColumnReader::next(rowBatch, numValues, notNull); + + const auto& srcBatch = *SafeCastBatchTo(data.get()); + auto& dstBatch = *SafeCastBatchTo(&rowBatch); + for (uint64_t i = 0; i < numValues; ++i) { + if (!rowBatch.hasNulls || rowBatch.notNull[i]) { + convertToDecimal(dstBatch, i, std::string(srcBatch.data[i], srcBatch.length[i])); + } + } + } + + private: + void convertToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, const std::string& decimalStr) { + constexpr 
int32_t MAX_PRECISION_128 = 38; + int32_t fromPrecision = 0; + int32_t fromScale = 0; + uint32_t start = 0; + bool negative = false; + if (decimalStr.empty()) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr); + return; + } + auto dotPos = decimalStr.find('.'); + if (dotPos == std::string::npos) { + fromScale = 0; + fromPrecision = decimalStr.length(); + dotPos = decimalStr.length(); + } else { + if (dotPos + 1 == decimalStr.length()) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr); + return; + } + fromPrecision = decimalStr.length() - 1; + fromScale = decimalStr.length() - dotPos - 1; + } + if (decimalStr.front() == '-') { + negative = true; + start++; + fromPrecision--; + } + const std::string integerPortion = decimalStr.substr(start, dotPos - start); + if (dotPos == start || fromPrecision > MAX_PRECISION_128 || fromPrecision <= 0 || + !std::all_of(integerPortion.begin(), integerPortion.end(), ::isdigit)) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr); + return; + } + + Int128 i128; + try { + bool overflow = false; + i128 = Int128(integerPortion); + // overflow won't happen + i128 *= scaleUpInt128ByPowerOfTen(Int128(1), fromScale, overflow); + } catch (const std::exception& e) { + handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr); + return; + } + if (dotPos + 1 < decimalStr.length()) { + const std::string fractionPortion = decimalStr.substr(dotPos + 1, fromScale); + if (!std::all_of(fractionPortion.begin(), fractionPortion.end(), ::isdigit)) { + handleOverflow(dstBatch, idx, throwOnOverflow); + return; + } + i128 += Int128(fractionPortion); + } + + auto [overflow, result] = convertDecimal(i128, fromScale, precision_, scale_); + if (overflow) { + handleOverflow(dstBatch, idx, throwOnOverflow); + return; + } + if (negative) { + result.negate(); + } + + if constexpr (std::is_same_v) { + dstBatch.values[idx] = result; + } else { + if (!result.fitsInLong()) { + handleOverflow(dstBatch, idx, + throwOnOverflow); + } else { + dstBatch.values[idx] = result.toLong(); + } + } + } + + const int32_t precision_; + const int32_t scale_; + }; + #define DEFINE_NUMERIC_CONVERT_READER(FROM, TO, TYPE) \ using FROM##To##TO##ColumnReader = \ NumericConvertColumnReader; @@ -730,6 +1065,18 @@ namespace orc { using Decimal64To##TO##ColumnReader = DecimalToStringVariantColumnReader; \ using Decimal128To##TO##ColumnReader = DecimalToStringVariantColumnReader; +#define DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(FROM, TO, TYPE) \ + using FROM##To##TO##ColumnReader = StringVariantToNumericColumnReader; + +#define DEFINE_STRING_VARIANT_CONVERT_READER(FROM, TO) \ + using FROM##To##TO##ColumnReader = StringVariantConvertColumnReader; + +#define DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(FROM, TO) \ + using FROM##To##TO##ColumnReader = StringVariantToTimestampColumnReader; + +#define DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(FROM, TO) \ + using FROM##To##TO##ColumnReader = StringVariantToDecimalColumnReader; + DEFINE_NUMERIC_CONVERT_READER(Boolean, Byte, int8_t) DEFINE_NUMERIC_CONVERT_READER(Boolean, Short, int16_t) DEFINE_NUMERIC_CONVERT_READER(Boolean, Int, int32_t) @@ -834,8 +1181,57 @@ namespace orc { DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(Char) DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(Varchar) + // String variant to numeric + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Boolean, bool) + 
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Byte, int8_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Short, int16_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Int, int32_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Long, int64_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Float, float) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Double, double) + + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Boolean, bool) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Byte, int8_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Short, int16_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Int, int32_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Long, int64_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Float, float) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Double, double) + + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Boolean, bool) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Byte, int8_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Short, int16_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Int, int32_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Long, int64_t) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Float, float) + DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Double, double) + + // String variant to string variant + DEFINE_STRING_VARIANT_CONVERT_READER(String, String) + DEFINE_STRING_VARIANT_CONVERT_READER(String, Char) + DEFINE_STRING_VARIANT_CONVERT_READER(String, Varchar) + DEFINE_STRING_VARIANT_CONVERT_READER(Char, Char) + DEFINE_STRING_VARIANT_CONVERT_READER(Char, String) + DEFINE_STRING_VARIANT_CONVERT_READER(Char, Varchar) + DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, String) + DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, Char) + DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, Varchar) + + // String variant to timestamp + DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(String, Timestamp) + DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(Char, Timestamp) + DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(Varchar, Timestamp) + + // String variant to decimal + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(String, Decimal64) + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(String, Decimal128) + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Char, Decimal64) + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Char, Decimal128) + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Varchar, Decimal64) + DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Varchar, Decimal128) + #define CREATE_READER(NAME) \ - return std::make_unique(_readType, fileType, stripe, throwOnOverflow); + return std::make_unique(readType, fileType, stripe, throwOnOverflow); #define CASE_CREATE_READER(TYPE, CONVERT) \ case TYPE: \ @@ -858,7 +1254,7 @@ namespace orc { #define CASE_CREATE_DECIMAL_READER(FROM) \ case DECIMAL: { \ - if (isDecimal64(_readType)) { \ + if (isDecimal64(readType)) { \ CREATE_READER(FROM##ToDecimal64ColumnReader) \ } else { \ CREATE_READER(FROM##ToDecimal128ColumnReader) \ @@ -868,7 +1264,7 @@ namespace orc { #define CASE_EXCEPTION \ default: \ throw SchemaEvolutionError("Cannot convert from " + fileType.toString() + " to " + \ - _readType.toString()); + readType.toString()); std::unique_ptr buildConvertReader(const Type& fileType, StripeStreams& stripe, bool useTightNumericVector, @@ -878,11 
+1274,11 @@ namespace orc { "SchemaEvolution only support tight vector, please create ColumnVectorBatch with " "option useTightNumericVector"); } - const auto& _readType = *stripe.getSchemaEvolution()->getReadType(fileType); + const auto& readType = *stripe.getSchemaEvolution()->getReadType(fileType); switch (fileType.getKind()) { case BOOLEAN: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BYTE, BooleanToByte) CASE_CREATE_READER(SHORT, BooleanToShort) CASE_CREATE_READER(INT, BooleanToInt) @@ -906,7 +1302,7 @@ namespace orc { } } case BYTE: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, ByteToBoolean) CASE_CREATE_READER(SHORT, ByteToShort) CASE_CREATE_READER(INT, ByteToInt) @@ -930,7 +1326,7 @@ namespace orc { } } case SHORT: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, ShortToBoolean) CASE_CREATE_READER(BYTE, ShortToByte) CASE_CREATE_READER(INT, ShortToInt) @@ -954,7 +1350,7 @@ namespace orc { } } case INT: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, IntToBoolean) CASE_CREATE_READER(BYTE, IntToByte) CASE_CREATE_READER(SHORT, IntToShort) @@ -978,7 +1374,7 @@ namespace orc { } } case LONG: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, LongToBoolean) CASE_CREATE_READER(BYTE, LongToByte) CASE_CREATE_READER(SHORT, LongToShort) @@ -1002,7 +1398,7 @@ namespace orc { } } case FLOAT: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, FloatToBoolean) CASE_CREATE_READER(BYTE, FloatToByte) CASE_CREATE_READER(SHORT, FloatToShort) @@ -1026,7 +1422,7 @@ namespace orc { } } case DOUBLE: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_READER(BOOLEAN, DoubleToBoolean) CASE_CREATE_READER(BYTE, DoubleToByte) CASE_CREATE_READER(SHORT, DoubleToShort) @@ -1050,7 +1446,7 @@ namespace orc { } } case DECIMAL: { - switch (_readType.getKind()) { + switch (readType.getKind()) { CASE_CREATE_FROM_DECIMAL_READER(BOOLEAN, Boolean) CASE_CREATE_FROM_DECIMAL_READER(BYTE, Byte) CASE_CREATE_FROM_DECIMAL_READER(SHORT, Short) @@ -1065,13 +1461,13 @@ namespace orc { CASE_CREATE_FROM_DECIMAL_READER(TIMESTAMP_INSTANT, Timestamp) case DECIMAL: { if (isDecimal64(fileType)) { - if (isDecimal64(_readType)) { + if (isDecimal64(readType)) { CREATE_READER(Decimal64ToDecimal64ColumnReader) } else { CREATE_READER(Decimal64ToDecimal128ColumnReader) } } else { - if (isDecimal64(_readType)) { + if (isDecimal64(readType)) { CREATE_READER(Decimal128ToDecimal64ColumnReader) } else { CREATE_READER(Decimal128ToDecimal128ColumnReader) @@ -1087,7 +1483,96 @@ namespace orc { CASE_EXCEPTION } } - case STRING: + case STRING: { + switch (readType.getKind()) { + CASE_CREATE_READER(BOOLEAN, StringToBoolean) + CASE_CREATE_READER(BYTE, StringToByte) + CASE_CREATE_READER(SHORT, StringToShort) + CASE_CREATE_READER(INT, StringToInt) + CASE_CREATE_READER(LONG, StringToLong) + CASE_CREATE_READER(FLOAT, StringToFloat) + CASE_CREATE_READER(DOUBLE, StringToDouble) + CASE_CREATE_READER(STRING, StringToString) + CASE_CREATE_READER(CHAR, StringToChar) + CASE_CREATE_READER(VARCHAR, StringToVarchar) + CASE_CREATE_READER(TIMESTAMP, StringToTimestamp) + CASE_CREATE_READER(TIMESTAMP_INSTANT, StringToTimestamp) + case DECIMAL: { + if (isDecimal64(readType)) { + CREATE_READER(StringToDecimal64ColumnReader) + } else { + CREATE_READER(StringToDecimal128ColumnReader) + } 
+ } + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + CASE_EXCEPTION + } + } + case CHAR: { + switch (readType.getKind()) { + CASE_CREATE_READER(BOOLEAN, CharToBoolean) + CASE_CREATE_READER(BYTE, CharToByte) + CASE_CREATE_READER(SHORT, CharToShort) + CASE_CREATE_READER(INT, CharToInt) + CASE_CREATE_READER(LONG, CharToLong) + CASE_CREATE_READER(FLOAT, CharToFloat) + CASE_CREATE_READER(DOUBLE, CharToDouble) + CASE_CREATE_READER(STRING, CharToString) + CASE_CREATE_READER(CHAR, CharToChar) + CASE_CREATE_READER(VARCHAR, CharToVarchar) + CASE_CREATE_READER(TIMESTAMP, CharToTimestamp) + CASE_CREATE_READER(TIMESTAMP_INSTANT, CharToTimestamp) + case DECIMAL: { + if (isDecimal64(readType)) { + CREATE_READER(CharToDecimal64ColumnReader) + } else { + CREATE_READER(CharToDecimal128ColumnReader) + } + } + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + CASE_EXCEPTION + } + } + case VARCHAR: { + switch (readType.getKind()) { + CASE_CREATE_READER(BOOLEAN, VarcharToBoolean) + CASE_CREATE_READER(BYTE, VarcharToByte) + CASE_CREATE_READER(SHORT, VarcharToShort) + CASE_CREATE_READER(INT, VarcharToInt) + CASE_CREATE_READER(LONG, VarcharToLong) + CASE_CREATE_READER(FLOAT, VarcharToFloat) + CASE_CREATE_READER(DOUBLE, VarcharToDouble) + CASE_CREATE_READER(STRING, VarcharToString) + CASE_CREATE_READER(CHAR, VarcharToChar) + CASE_CREATE_READER(VARCHAR, VarcharToVarchar) + CASE_CREATE_READER(TIMESTAMP, VarcharToTimestamp) + CASE_CREATE_READER(TIMESTAMP_INSTANT, VarcharToTimestamp) + case DECIMAL: { + if (isDecimal64(readType)) { + CREATE_READER(VarcharToDecimal64ColumnReader) + } else { + CREATE_READER(VarcharToDecimal128ColumnReader) + } + } + case BINARY: + case LIST: + case MAP: + case STRUCT: + case UNION: + case DATE: + CASE_EXCEPTION + } + } case BINARY: case TIMESTAMP: case LIST: @@ -1095,21 +1580,9 @@ namespace orc { case STRUCT: case UNION: case DATE: - case VARCHAR: - case CHAR: case TIMESTAMP_INSTANT: CASE_EXCEPTION } } -#undef DEFINE_NUMERIC_CONVERT_READER -#undef DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER -#undef DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER -#undef DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER -#undef DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER -#undef DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER -#undef CASE_CREATE_FROM_DECIMAL_READER -#undef CASE_CREATE_READER -#undef CASE_EXCEPTION - } // namespace orc diff --git a/c++/src/CpuInfoUtil.cc b/c++/src/CpuInfoUtil.cc index 82669de20a..588f8dc96a 100644 --- a/c++/src/CpuInfoUtil.cc +++ b/c++/src/CpuInfoUtil.cc @@ -74,7 +74,7 @@ namespace orc { #if defined(_WIN32) //------------------------------ WINDOWS ------------------------------// - void OsRetrieveCacheSize(std::array* cache_sizes) { + void OsRetrieveCacheSize(std::array* cacheSizes) { PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = nullptr; PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer_position = nullptr; DWORD buffer_size = 0; @@ -108,8 +108,8 @@ namespace orc { if (RelationCache == buffer_position->Relationship) { PCACHE_DESCRIPTOR cache = &buffer_position->Cache; if (cache->Level >= 1 && cache->Level <= kCacheLevels) { - const int64_t current = (*cache_sizes)[cache->Level - 1]; - (*cache_sizes)[cache->Level - 1] = std::max(current, cache->Size); + const int64_t current = (*cacheSizes)[cache->Level - 1]; + (*cacheSizes)[cache->Level - 1] = std::max(current, cache->Size); } } offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); @@ -136,23 +136,22 @@ namespace orc { } #endif // MINGW - void OsRetrieveCpuInfo(int64_t* 
hardware_flags, CpuInfo::Vendor* vendor, - std::string* model_name) { + void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor, + std::string* modelName) { int register_EAX_id = 1; int highest_valid_id = 0; int highest_extended_valid_id = 0; std::bitset<32> features_ECX; - std::array cpu_info; + std::array cpuInfo; // Get highest valid id - __cpuid(cpu_info.data(), 0); - highest_valid_id = cpu_info[0]; + __cpuid(cpuInfo.data(), 0); + highest_valid_id = cpuInfo[0]; // HEX of "GenuineIntel": 47656E75 696E6549 6E74656C // HEX of "AuthenticAMD": 41757468 656E7469 63414D44 - if (cpu_info[1] == 0x756e6547 && cpu_info[3] == 0x49656e69 && cpu_info[2] == 0x6c65746e) { + if (cpuInfo[1] == 0x756e6547 && cpuInfo[3] == 0x49656e69 && cpuInfo[2] == 0x6c65746e) { *vendor = CpuInfo::Vendor::Intel; - } else if (cpu_info[1] == 0x68747541 && cpu_info[3] == 0x69746e65 && - cpu_info[2] == 0x444d4163) { + } else if (cpuInfo[1] == 0x68747541 && cpuInfo[3] == 0x69746e65 && cpuInfo[2] == 0x444d4163) { *vendor = CpuInfo::Vendor::AMD; } @@ -161,19 +160,19 @@ namespace orc { } // EAX=1: Processor Info and Feature Bits - __cpuidex(cpu_info.data(), register_EAX_id, 0); - features_ECX = cpu_info[2]; + __cpuidex(cpuInfo.data(), register_EAX_id, 0); + features_ECX = cpuInfo[2]; // Get highest extended id - __cpuid(cpu_info.data(), 0x80000000); - highest_extended_valid_id = cpu_info[0]; + __cpuid(cpuInfo.data(), 0x80000000); + highest_extended_valid_id = cpuInfo[0]; // Retrieve CPU model name if (highest_extended_valid_id >= static_cast(0x80000004)) { - model_name->clear(); + modelName->clear(); for (int i = 0x80000002; i <= static_cast(0x80000004); ++i) { - __cpuidex(cpu_info.data(), i, 0); - *model_name += std::string(reinterpret_cast(cpu_info.data()), sizeof(cpu_info)); + __cpuidex(cpuInfo.data(), i, 0); + *modelName += std::string(reinterpret_cast(cpuInfo.data()), sizeof(cpuInfo)); } } @@ -184,37 +183,37 @@ namespace orc { zmm_enabled = (xcr0 & 0xE0) == 0xE0; } - if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3; - if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1; - if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2; - if (features_ECX[23]) *hardware_flags |= CpuInfo::POPCNT; - if (features_ECX[28]) *hardware_flags |= CpuInfo::AVX; + if (features_ECX[9]) *hardwareFlags |= CpuInfo::SSSE3; + if (features_ECX[19]) *hardwareFlags |= CpuInfo::SSE4_1; + if (features_ECX[20]) *hardwareFlags |= CpuInfo::SSE4_2; + if (features_ECX[23]) *hardwareFlags |= CpuInfo::POPCNT; + if (features_ECX[28]) *hardwareFlags |= CpuInfo::AVX; // cpuid with EAX=7, ECX=0: Extended Features register_EAX_id = 7; if (highest_valid_id > register_EAX_id) { - __cpuidex(cpu_info.data(), register_EAX_id, 0); - std::bitset<32> features_EBX = cpu_info[1]; + __cpuidex(cpuInfo.data(), register_EAX_id, 0); + std::bitset<32> features_EBX = cpuInfo[1]; - if (features_EBX[3]) *hardware_flags |= CpuInfo::BMI1; - if (features_EBX[5]) *hardware_flags |= CpuInfo::AVX2; - if (features_EBX[8]) *hardware_flags |= CpuInfo::BMI2; + if (features_EBX[3]) *hardwareFlags |= CpuInfo::BMI1; + if (features_EBX[5]) *hardwareFlags |= CpuInfo::AVX2; + if (features_EBX[8]) *hardwareFlags |= CpuInfo::BMI2; if (zmm_enabled) { - if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F; - if (features_EBX[17]) *hardware_flags |= CpuInfo::AVX512DQ; - if (features_EBX[28]) *hardware_flags |= CpuInfo::AVX512CD; - if (features_EBX[30]) *hardware_flags |= CpuInfo::AVX512BW; - if (features_EBX[31]) *hardware_flags |= CpuInfo::AVX512VL; + if 
(features_EBX[16]) *hardwareFlags |= CpuInfo::AVX512F; + if (features_EBX[17]) *hardwareFlags |= CpuInfo::AVX512DQ; + if (features_EBX[28]) *hardwareFlags |= CpuInfo::AVX512CD; + if (features_EBX[30]) *hardwareFlags |= CpuInfo::AVX512BW; + if (features_EBX[31]) *hardwareFlags |= CpuInfo::AVX512VL; } } } #elif defined(CPUINFO_ARCH_ARM) // Windows on Arm - void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, - std::string* model_name) { - *hardware_flags |= CpuInfo::ASIMD; - // TODO: vendor, model_name + void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor, + std::string* modelName) { + *hardwareFlags |= CpuInfo::ASIMD; + // TODO: vendor, modelName } #endif @@ -236,25 +235,25 @@ namespace orc { return std::nullopt; } - void OsRetrieveCacheSize(std::array* cache_sizes) { + void OsRetrieveCacheSize(std::array* cacheSizes) { static_assert(kCacheLevels >= 3, ""); auto c = IntegerSysCtlByName("hw.l1dcachesize"); if (c.has_value()) { - (*cache_sizes)[0] = *c; + (*cacheSizes)[0] = *c; } c = IntegerSysCtlByName("hw.l2cachesize"); if (c.has_value()) { - (*cache_sizes)[1] = *c; + (*cacheSizes)[1] = *c; } c = IntegerSysCtlByName("hw.l3cachesize"); if (c.has_value()) { - (*cache_sizes)[2] = *c; + (*cacheSizes)[2] = *c; } } - void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, - std::string* model_name) { - // hardware_flags + void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor, + std::string* modelName) { + // hardwareFlags struct SysCtlCpuFeature { const char* name; int64_t flag; @@ -280,13 +279,13 @@ namespace orc { for (const auto& feature : features) { auto v = IntegerSysCtlByName(feature.name); if (v.value_or(0)) { - *hardware_flags |= feature.flag; + *hardwareFlags |= feature.flag; } } - // TODO: vendor, model_name + // TODO: vendor, modelName *vendor = CpuInfo::Vendor::Unknown; - *model_name = "Unknown"; + *modelName = "Unknown"; } #else @@ -345,7 +344,7 @@ namespace orc { const struct { std::string name; int64_t flag; - } flag_mappings[] = { + } flagMappings[] = { #if defined(CPUINFO_ARCH_X86) {"ssse3", CpuInfo::SSSE3}, {"sse4_1", CpuInfo::SSE4_1}, @@ -364,12 +363,12 @@ namespace orc { {"asimd", CpuInfo::ASIMD}, #endif }; - const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]); + const int64_t num_flags = sizeof(flagMappings) / sizeof(flagMappings[0]); int64_t flags = 0; for (int i = 0; i < num_flags; ++i) { - if (values.find(flag_mappings[i].name) != std::string::npos) { - flags |= flag_mappings[i].flag; + if (values.find(flagMappings[i].name) != std::string::npos) { + flags |= flagMappings[i].flag; } } return flags; @@ -469,9 +468,9 @@ namespace orc { #elif defined(CPUINFO_ARCH_ARM) //------------------------------ AARCH64 ------------------------------// - bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { - if (simd_level == "NONE") { - *hardware_flags &= ~CpuInfo::ASIMD; + bool ArchParseUserSimdLevel(const std::string& simdLevel, int64_t* hardwareFlags) { + if (simdLevel == "NONE") { + *hardwareFlags &= ~CpuInfo::ASIMD; return true; } return false; @@ -485,7 +484,7 @@ namespace orc { #else //------------------------------ PPC, ... 
------------------------------// - bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { + bool ArchParseUserSimdLevel(const std::string& simdLevel, int64_t* hardwareFlags) { return true; } @@ -496,17 +495,17 @@ namespace orc { } // namespace struct CpuInfo::Impl { - int64_t hardware_flags = 0; + int64_t hardwareFlags = 0; int numCores = 0; - int64_t original_hardware_flags = 0; + int64_t originalHardwareFlags = 0; Vendor vendor = Vendor::Unknown; - std::string model_name = "Unknown"; - std::array cache_sizes{}; + std::string modelName = "Unknown"; + std::array cacheSizes{}; Impl() { - OsRetrieveCacheSize(&cache_sizes); - OsRetrieveCpuInfo(&hardware_flags, &vendor, &model_name); - original_hardware_flags = hardware_flags; + OsRetrieveCacheSize(&cacheSizes); + OsRetrieveCpuInfo(&hardwareFlags, &vendor, &modelName); + originalHardwareFlags = hardwareFlags; numCores = std::max(static_cast(std::thread::hardware_concurrency()), 1); // parse user simd level @@ -514,7 +513,7 @@ namespace orc { std::string userSimdLevel = maybe_env_var == nullptr ? "NONE" : std::string(maybe_env_var); std::transform(userSimdLevel.begin(), userSimdLevel.end(), userSimdLevel.begin(), [](unsigned char c) { return std::toupper(c); }); - if (!ArchParseUserSimdLevel(userSimdLevel, &hardware_flags)) { + if (!ArchParseUserSimdLevel(userSimdLevel, &hardwareFlags)) { throw ParseError("Invalid value for ORC_USER_SIMD_LEVEL: " + userSimdLevel); } } @@ -530,8 +529,8 @@ namespace orc { #endif const CpuInfo* CpuInfo::getInstance() { - static CpuInfo cpu_info; - return &cpu_info; + static CpuInfo cpuInfo; + return &cpuInfo; } #ifdef __clang__ @@ -539,7 +538,7 @@ namespace orc { #endif int64_t CpuInfo::hardwareFlags() const { - return impl_->hardware_flags; + return impl_->hardwareFlags; } int CpuInfo::numCores() const { @@ -551,7 +550,7 @@ namespace orc { } const std::string& CpuInfo::modelName() const { - return impl_->model_name; + return impl_->modelName; } int64_t CpuInfo::cacheSize(CacheLevel level) const { @@ -564,18 +563,18 @@ namespace orc { static_assert(static_cast(CacheLevel::L1) == 0, ""); const int i = static_cast(level); - if (impl_->cache_sizes[i] > 0) return impl_->cache_sizes[i]; + if (impl_->cacheSizes[i] > 0) return impl_->cacheSizes[i]; if (i == 0) return kDefaultCacheSizes[0]; // l3 may be not available, return maximum of l2 or default size - return std::max(kDefaultCacheSizes[i], impl_->cache_sizes[i - 1]); + return std::max(kDefaultCacheSizes[i], impl_->cacheSizes[i - 1]); } bool CpuInfo::isSupported(int64_t flags) const { - return (impl_->hardware_flags & flags) == flags; + return (impl_->hardwareFlags & flags) == flags; } bool CpuInfo::isDetected(int64_t flags) const { - return (impl_->original_hardware_flags & flags) == flags; + return (impl_->originalHardwareFlags & flags) == flags; } void CpuInfo::verifyCpuRequirements() const { diff --git a/c++/src/Exceptions.cc b/c++/src/Exceptions.cc index 30ecf7dc7c..2ba1ab404c 100644 --- a/c++/src/Exceptions.cc +++ b/c++/src/Exceptions.cc @@ -84,4 +84,20 @@ namespace orc { SchemaEvolutionError::~SchemaEvolutionError() noexcept { // PASS } + + CompressionError::CompressionError(const std::string& whatArg) : runtime_error(whatArg) { + // PASS + } + + CompressionError::CompressionError(const char* whatArg) : runtime_error(whatArg) { + // PASS + } + + CompressionError::CompressionError(const CompressionError& error) : runtime_error(error) { + // PASS + } + + CompressionError::~CompressionError() noexcept { + // PASS + } } // namespace 
orc diff --git a/c++/src/Geospatial.cc b/c++/src/Geospatial.cc new file mode 100644 index 0000000000..6d7d268703 --- /dev/null +++ b/c++/src/Geospatial.cc @@ -0,0 +1,307 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This file contains code adapted from the Apache Arrow project. + * + * Original source: + * https://github.com/apache/arrow/blob/main/cpp/src/parquet/geospatial/statistics.cc + * + * The original code is licensed under the Apache License, Version 2.0. + * + * Modifications may have been made from the original source. + */ + +#include "orc/Geospatial.hh" +#include "orc/Exceptions.hh" + +#include "Geospatial.hh" + +#include +#include +#include +#include + +namespace orc::geospatial { + + template + inline std::enable_if_t, T> safeLoadAs(const uint8_t* unaligned) { + std::remove_const_t ret; + std::memcpy(&ret, unaligned, sizeof(T)); + return ret; + } + + template + inline std::enable_if_t && std::is_trivially_copyable_v && + sizeof(T) == sizeof(U), + U> + safeCopy(T value) { + std::remove_const_t ret; + std::memcpy(&ret, static_cast(&value), sizeof(T)); + return ret; + } + + static bool isLittleEndian() { + static union { + uint32_t i; + char c[4]; + } num = {0x01020304}; + return num.c[0] == 4; + } + +#if defined(_MSC_VER) +#include // IWYU pragma: keep +#define ORC_BYTE_SWAP64 _byteSwap_uint64 +#define ORC_BYTE_SWAP32 _byteSwap_ulong +#else +#define ORC_BYTE_SWAP64 __builtin_bswap64 +#define ORC_BYTE_SWAP32 __builtin_bswap32 +#endif + + // Swap the byte order (i.e. endianness) + static inline uint32_t byteSwap(uint32_t value) { + return static_cast(ORC_BYTE_SWAP32(value)); + } + static inline double byteSwap(double value) { + const uint64_t swapped = ORC_BYTE_SWAP64(safeCopy(value)); + return safeCopy(swapped); + } + + std::string BoundingBox::toString() const { + std::stringstream ss; + ss << "BoundingBox{xMin=" << min[0] << ", xMax=" << max[0] << ", yMin=" << min[1] + << ", yMax=" << max[1] << ", zMin=" << min[2] << ", zMax=" << max[2] << ", mMin=" << min[3] + << ", mMax=" << max[3] << "}"; + return ss.str(); + } + + /// \brief Object to keep track of the low-level consumption of a well-known binary + /// geometry + /// + /// Briefly, ISO well-known binary supported by the Parquet spec is an endian byte + /// (0x01 or 0x00), followed by geometry type + dimensions encoded as a (uint32_t), + /// followed by geometry-specific data. Coordinate sequences are represented by a + /// uint32_t (the number of coordinates) plus a sequence of doubles (number of coordinates + /// multiplied by the number of dimensions). 
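As a quick illustration of the WKB layout described in the comment above, here is a minimal sketch (not part of this change) that hand-assembles a little-endian WKB POINT(30 10) and feeds it to the bounder added in this file. WKBGeometryBounder, mergeGeometry, bounds() and BoundingBox::toString() come from this patch; makeWkbPoint and the little-endian-host assumption are purely illustrative.

#include <cstdint>
#include <cstring>
#include <vector>

// Hand-assemble ISO WKB for POINT(x y): one endian byte, a uint32 geometry type,
// then the coordinate doubles -- 1 + 4 + 2 * 8 = 21 bytes for an XY point.
static std::vector<uint8_t> makeWkbPoint(double x, double y) {
  std::vector<uint8_t> wkb;
  wkb.push_back(0x01);                          // 0x01 marks little-endian encoding
  const uint32_t pointXy = 1;                   // WKB geometry type 1 == POINT with XY dimensions
  uint8_t scratch[sizeof(double)];
  std::memcpy(scratch, &pointXy, sizeof(pointXy));
  wkb.insert(wkb.end(), scratch, scratch + sizeof(pointXy));
  std::memcpy(scratch, &x, sizeof(x));
  wkb.insert(wkb.end(), scratch, scratch + sizeof(x));
  std::memcpy(scratch, &y, sizeof(y));
  wkb.insert(wkb.end(), scratch, scratch + sizeof(y));
  return wkb;
}

// Usage sketch (assumes a little-endian host so the raw memcpy bytes match the
// 0x01 endian marker):
//   orc::geospatial::WKBGeometryBounder bounder;
//   auto wkb = makeWkbPoint(30.0, 10.0);
//   bounder.mergeGeometry(wkb.data(), wkb.size());
//   std::cout << bounder.bounds().toString() << std::endl;

After merging, the bounds should report xMin = xMax = 30 and yMin = yMax = 10, while the z and m ranges stay at their initial (empty) values.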
+ class WKBBuffer { + public: + WKBBuffer() : data_(nullptr), size_(0) {} + WKBBuffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {} + + uint8_t readUInt8() { + return readChecked(); + } + + uint32_t readUInt32(bool swap) { + auto value = readChecked(); + return swap ? byteSwap(value) : value; + } + + template + void readCoords(uint32_t nCoords, bool swap, Visit&& visit) { + size_t total_bytes = nCoords * sizeof(Coord); + if (size_ < total_bytes) { + } + + if (swap) { + Coord coord; + for (uint32_t i = 0; i < nCoords; i++) { + coord = readUnchecked(); + for (auto& c : coord) { + c = byteSwap(c); + } + + std::forward(visit)(coord); + } + } else { + for (uint32_t i = 0; i < nCoords; i++) { + std::forward(visit)(readUnchecked()); + } + } + } + + size_t size() const { + return size_; + } + + private: + const uint8_t* data_; + size_t size_; + + template + T readChecked() { + if (size_ < sizeof(T)) { + std::stringstream ss; + ss << "Can't read" << sizeof(T) << " bytes from WKBBuffer with " << size_ << " remaining"; + throw ParseError(ss.str()); + } + + return readUnchecked(); + } + + template + T readUnchecked() { + T out = safeLoadAs(data_); + data_ += sizeof(T); + size_ -= sizeof(T); + return out; + } + }; + + using GeometryTypeAndDimensions = std::pair; + + namespace { + + std::optional parseGeometryType(uint32_t wkbGeometryType) { + // The number 1000 can be used because WKB geometry types are constructed + // on purpose such that this relationship is true (e.g., LINESTRING ZM maps + // to 3002). + uint32_t geometryTypeComponent = wkbGeometryType % 1000; + uint32_t dimensionsComponent = wkbGeometryType / 1000; + + auto minGeometryTypeValue = static_cast(GeometryType::VALUE_MIN); + auto maxGeometryTypeValue = static_cast(GeometryType::VALUE_MAX); + auto minDimensionValue = static_cast(Dimensions::VALUE_MIN); + auto maxDimensionValue = static_cast(Dimensions::VALUE_MAX); + + if (geometryTypeComponent < minGeometryTypeValue || + geometryTypeComponent > maxGeometryTypeValue || dimensionsComponent < minDimensionValue || + dimensionsComponent > maxDimensionValue) { + return std::nullopt; + } + + return std::make_optional( + GeometryTypeAndDimensions{static_cast(geometryTypeComponent), + static_cast(dimensionsComponent)}); + } + + } // namespace + + std::vector WKBGeometryBounder::geometryTypes() const { + std::vector out(geospatialTypes_.begin(), geospatialTypes_.end()); + std::sort(out.begin(), out.end()); + return out; + } + + void WKBGeometryBounder::mergeGeometry(std::string_view bytesWkb) { + if (!isValid_) { + return; + } + mergeGeometry(reinterpret_cast(bytesWkb.data()), bytesWkb.size()); + } + + void WKBGeometryBounder::mergeGeometry(const uint8_t* bytesWkb, size_t bytesSize) { + if (!isValid_) { + return; + } + WKBBuffer src{bytesWkb, static_cast(bytesSize)}; + try { + mergeGeometryInternal(&src, /*record_wkb_type=*/true); + } catch (const ParseError&) { + invalidate(); + return; + } + if (src.size() != 0) { + // "Exepcted zero bytes after consuming WKB + invalidate(); + } + } + + void WKBGeometryBounder::mergeGeometryInternal(WKBBuffer* src, bool recordWkbType) { + uint8_t endian = src->readUInt8(); + bool swap = endian != 0x00; + if (isLittleEndian()) { + swap = endian != 0x01; + } + + uint32_t wkbGeometryType = src->readUInt32(swap); + auto geometryTypeAndDimensions = parseGeometryType(wkbGeometryType); + if (!geometryTypeAndDimensions.has_value()) { + invalidate(); + return; + } + auto& [geometry_type, dimensions] = geometryTypeAndDimensions.value(); + + // Keep 
track of geometry types encountered if at the top level + if (recordWkbType) { + geospatialTypes_.insert(static_cast(wkbGeometryType)); + } + + switch (geometry_type) { + case GeometryType::POINT: + mergeSequence(src, dimensions, 1, swap); + break; + + case GeometryType::LINESTRING: { + uint32_t nCoords = src->readUInt32(swap); + mergeSequence(src, dimensions, nCoords, swap); + break; + } + case GeometryType::POLYGON: { + uint32_t n_parts = src->readUInt32(swap); + for (uint32_t i = 0; i < n_parts; i++) { + uint32_t nCoords = src->readUInt32(swap); + mergeSequence(src, dimensions, nCoords, swap); + } + break; + } + + // These are all encoded the same in WKB, even though this encoding would + // allow for parts to be of a different geometry type or different dimensions. + // For the purposes of bounding, this does not cause us problems. We pass + // record_wkb_type = false because we do not want the child geometry to be + // added to the geometry_types list (e.g., for a MultiPoint, we only want + // the code for MultiPoint to be added, not the code for Point). + case GeometryType::MULTIPOINT: + case GeometryType::MULTILINESTRING: + case GeometryType::MULTIPOLYGON: + case GeometryType::GEOMETRYCOLLECTION: { + uint32_t n_parts = src->readUInt32(swap); + for (uint32_t i = 0; i < n_parts; i++) { + mergeGeometryInternal(src, /*record_wkb_type*/ false); + } + break; + } + } + } + + void WKBGeometryBounder::mergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords, + bool swap) { + switch (dimensions) { + case Dimensions::XY: + src->readCoords(nCoords, swap, + [&](BoundingBox::XY coord) { box_.updateXY(coord); }); + break; + case Dimensions::XYZ: + src->readCoords(nCoords, swap, + [&](BoundingBox::XYZ coord) { box_.updateXYZ(coord); }); + break; + case Dimensions::XYM: + src->readCoords(nCoords, swap, + [&](BoundingBox::XYM coord) { box_.updateXYM(coord); }); + break; + case Dimensions::XYZM: + src->readCoords( + nCoords, swap, [&](BoundingBox::XYZM coord) { box_.updateXYZM(coord); }); + break; + default: + invalidate(); + } + } + +} // namespace orc::geospatial diff --git a/c++/src/Geospatial.hh b/c++/src/Geospatial.hh new file mode 100644 index 0000000000..aebb72747a --- /dev/null +++ b/c++/src/Geospatial.hh @@ -0,0 +1,86 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ORC_GEOSPATIAL_IMPL_HH +#define ORC_GEOSPATIAL_IMPL_HH + +#include "orc/Geospatial.hh" + +#include +#include + +namespace orc { + namespace geospatial { + class WKBBuffer; + + class WKBGeometryBounder { + public: + void mergeGeometry(std::string_view bytesWkb); + void mergeGeometry(const uint8_t* bytesWkb, size_t bytesSize); + + void mergeBox(const BoundingBox& box) { + box_.merge(box); + } + void mergeGeometryTypes(const std::vector& geospatialTypes) { + geospatialTypes_.insert(geospatialTypes.begin(), geospatialTypes.end()); + } + void merge(const WKBGeometryBounder& other) { + if (!isValid() || !other.isValid()) { + invalidate(); + return; + } + box_.merge(other.box_); + geospatialTypes_.insert(other.geospatialTypes_.begin(), other.geospatialTypes_.end()); + } + + // Get the bounding box for the merged geometries. + const BoundingBox& bounds() const { + return box_; + } + + // Get the set of geometry types encountered during merging. + // Returns a sorted vector of geometry type IDs. + std::vector geometryTypes() const; + + void reset() { + isValid_ = true; + box_.reset(); + geospatialTypes_.clear(); + } + bool isValid() const { + return isValid_; + } + void invalidate() { + isValid_ = false; + box_.invalidate(); + geospatialTypes_.clear(); + } + + private: + BoundingBox box_; + std::unordered_set geospatialTypes_; + bool isValid_ = true; + + void mergeGeometryInternal(WKBBuffer* src, bool recordWkbType); + void mergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords, bool swap); + }; + } // namespace geospatial +} // namespace orc + +#endif diff --git a/c++/src/Int128.cc b/c++/src/Int128.cc index 4a1d0b763a..0d4da78b5a 100644 --- a/c++/src/Int128.cc +++ b/c++/src/Int128.cc @@ -25,9 +25,44 @@ #include namespace orc { + NO_SANITIZE_ATTR + Int128& Int128::operator<<=(uint32_t bits) { + if (bits != 0) { + if (bits < 64) { + highbits_ <<= bits; + highbits_ |= (lowbits_ >> (64 - bits)); + lowbits_ <<= bits; + } else if (bits < 128) { + highbits_ = static_cast(lowbits_) << (bits - 64); + lowbits_ = 0; + } else { + highbits_ = 0; + lowbits_ = 0; + } + } + return *this; + } + + NO_SANITIZE_ATTR + Int128& Int128::operator>>=(uint32_t bits) { + if (bits != 0) { + if (bits < 64) { + lowbits_ >>= bits; + lowbits_ |= static_cast(highbits_ << (64 - bits)); + highbits_ = static_cast(static_cast(highbits_) >> bits); + } else if (bits < 128) { + lowbits_ = static_cast(highbits_ >> (bits - 64)); + highbits_ = highbits_ >= 0 ? 0 : -1l; + } else { + highbits_ = highbits_ >= 0 ? 
0 : -1l; + lowbits_ = static_cast(highbits_); + } + } + return *this; + } Int128 Int128::maximumValue() { - return Int128(0x7fffffffffffffff, 0xfffffffffffffff); + return Int128(0x7fffffffffffffff, 0xffffffffffffffff); } Int128 Int128::minimumValue() { diff --git a/c++/src/LzoDecompressor.cc b/c++/src/LzoDecompressor.cc index f494f4b651..68e25425c2 100644 --- a/c++/src/LzoDecompressor.cc +++ b/c++/src/LzoDecompressor.cc @@ -342,7 +342,7 @@ namespace orc { char* literalOutputLimit = output + literalLength; if (literalOutputLimit > fastOutputLimit || input + literalLength > inputLimit - SIZE_OF_LONG) { - if (literalOutputLimit > outputLimit) { + if (literalOutputLimit > outputLimit || input + literalLength > inputLimit) { throw MalformedInputException(input - inputAddress); } diff --git a/c++/src/Options.hh b/c++/src/Options.hh index daf9d52e1c..0a4bd56d8f 100644 --- a/c++/src/Options.hh +++ b/c++/src/Options.hh @@ -23,6 +23,8 @@ #include "orc/OrcFile.hh" #include "orc/Reader.hh" +#include "io/Cache.hh" + #include namespace orc { @@ -43,6 +45,7 @@ namespace orc { MemoryPool* memoryPool; std::string serializedTail; ReaderMetrics* metrics; + CacheOptions cacheOptions; ReaderOptionsPrivate() { tailLocation = std::numeric_limits::max(); @@ -122,6 +125,15 @@ namespace orc { return privateBits_->errorStream; } + ReaderOptions& ReaderOptions::setCacheOptions(const CacheOptions& cacheOptions) { + privateBits_->cacheOptions = cacheOptions; + return *this; + } + + const CacheOptions& ReaderOptions::getCacheOptions() const { + return privateBits_->cacheOptions; + } + /** * RowReaderOptions Implementation */ diff --git a/c++/src/OrcFile.cc b/c++/src/OrcFile.cc index 8899299d3d..be86724329 100644 --- a/c++/src/OrcFile.cc +++ b/c++/src/OrcFile.cc @@ -79,7 +79,7 @@ namespace orc { } void read(void* buf, uint64_t length, uint64_t offset) override { - SCOPED_STOPWATCH(metrics, IOBlockingLatencyUs, IOCount); + SCOPED_STOPWATCH(metrics_, IOBlockingLatencyUs, IOCount); if (!buf) { throw ParseError("Buffer is null"); } diff --git a/c++/src/OrcHdfsFile.cc b/c++/src/OrcHdfsFile.cc index 09ff71a0e9..d878e276cb 100644 --- a/c++/src/OrcHdfsFile.cc +++ b/c++/src/OrcHdfsFile.cc @@ -42,23 +42,23 @@ namespace orc { class HdfsFileInputStream : public InputStream { private: - std::string filename; - std::unique_ptr file; - std::unique_ptr file_system; - uint64_t totalLength; - const uint64_t READ_SIZE = 1024 * 1024; // 1 MB - ReaderMetrics* metrics; + std::string filename_; + std::unique_ptr file_; + std::unique_ptr fileSystem_; + uint64_t totalLength_; + const uint64_t readSize_ = 1024 * 1024; // 1 MB + ReaderMetrics* metrics_; public: - HdfsFileInputStream(std::string _filename, ReaderMetrics* _metrics) : metrics(_metrics) { - filename = _filename; + HdfsFileInputStream(std::string filename, ReaderMetrics* metrics) : metrics_(metrics) { + filename_ = filename; // Building a URI object from the given uri_path hdfs::URI uri; try { - uri = hdfs::URI::parse_from_string(filename); + uri = hdfs::URI::parse_from_string(filename_); } catch (const hdfs::uri_parse_error&) { - throw ParseError("Malformed URI: " + filename); + throw ParseError("Malformed URI: " + filename_); } // This sets conf path to default "$HADOOP_CONF_DIR" or "/etc/hadoop/conf" @@ -82,9 +82,9 @@ namespace orc { } hdfs::IoService* io_service = hdfs::IoService::New(); // Wrapping file_system into a unique pointer to guarantee deletion - file_system = + fileSystem_ = std::unique_ptr(hdfs::FileSystem::New(io_service, "", options)); - if (file_system.get() == 
nullptr) { + if (fileSystem_.get() == nullptr) { throw ParseError("Can't create FileSystem object. "); } hdfs::Status status; @@ -92,13 +92,13 @@ namespace orc { if (!uri.get_host().empty()) { // Using port if supplied, otherwise using "" to look up port in configs std::string port = uri.has_port() ? std::to_string(uri.get_port()) : ""; - status = file_system->Connect(uri.get_host(), port); + status = fileSystem_->Connect(uri.get_host(), port); if (!status.ok()) { throw ParseError("Can't connect to " + uri.get_host() + ":" + port + ". " + status.ToString()); } } else { - status = file_system->ConnectToDefaultFs(); + status = fileSystem_->ConnectToDefaultFs(); if (!status.ok()) { if (!options.defaultFS.get_host().empty()) { throw ParseError("Error connecting to " + options.defaultFS.str() + ". " + @@ -110,32 +110,32 @@ namespace orc { } } - if (file_system.get() == nullptr) { + if (fileSystem_.get() == nullptr) { throw ParseError("Can't connect the file system. "); } hdfs::FileHandle* file_raw = nullptr; - status = file_system->Open(uri.get_path(true), &file_raw); + status = fileSystem_->Open(uri.get_path(true), &file_raw); if (!status.ok()) { throw ParseError("Can't open " + uri.get_path(true) + ". " + status.ToString()); } // Wrapping file_raw into a unique pointer to guarantee deletion - file.reset(file_raw); + file_.reset(file_raw); hdfs::StatInfo stat_info; - status = file_system->GetFileInfo(uri.get_path(true), stat_info); + status = fileSystem_->GetFileInfo(uri.get_path(true), stat_info); if (!status.ok()) { throw ParseError("Can't stat " + uri.get_path(true) + ". " + status.ToString()); } - totalLength = stat_info.length; + totalLength_ = stat_info.length; } uint64_t getLength() const override { - return totalLength; + return totalLength_; } uint64_t getNaturalReadSize() const override { - return READ_SIZE; + return readSize_; } void read(void* buf, uint64_t length, uint64_t offset) override { @@ -151,8 +151,8 @@ namespace orc { do { status = - file->PositionRead(buf_ptr, static_cast(length) - total_bytes_read, - static_cast(offset + total_bytes_read), &last_bytes_read); + file_->PositionRead(buf_ptr, static_cast(length) - total_bytes_read, + static_cast(offset + total_bytes_read), &last_bytes_read); if (!status.ok()) { throw ParseError("Error reading the file: " + status.ToString()); } @@ -162,7 +162,7 @@ namespace orc { } const std::string& getName() const override { - return filename; + return filename_; } ~HdfsFileInputStream() override; diff --git a/c++/src/RLE.cc b/c++/src/RLE.cc index 89aca6a10e..19ca558fc6 100644 --- a/c++/src/RLE.cc +++ b/c++/src/RLE.cc @@ -77,6 +77,7 @@ namespace orc { add(data, numValues, notNull); } + NO_SANITIZE_ATTR void RleEncoder::writeVslong(int64_t val) { writeVulong((val << 1) ^ (val >> 63)); } @@ -108,15 +109,23 @@ namespace orc { void RleEncoder::recordPosition(PositionRecorder* recorder) const { uint64_t flushedSize = outputStream->getSize(); - uint64_t unflushedSize = static_cast(bufferPosition); + uint64_t unusedBufferSize = static_cast(bufferLength - bufferPosition); if (outputStream->isCompressed()) { recorder->add(flushedSize); - recorder->add(unflushedSize); + // There are multiple blocks in the input buffer, but bufferPosition only records the + // effective length of the last block. We need rawInputBufferSize to record the total length + // of all variable blocks. 
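+ // In other words: record every raw byte handed to this stream so far, minus the unused tail of the current encoder block.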
+ recorder->add(outputStream->getRawInputBufferSize() - unusedBufferSize); } else { - flushedSize -= static_cast(bufferLength); - recorder->add(flushedSize + unflushedSize); + recorder->add(flushedSize - unusedBufferSize); } recorder->add(static_cast(numLiterals)); } + void RleEncoder::finishEncode() { + outputStream->BackUp(static_cast(bufferLength - bufferPosition)); + outputStream->finishStream(); + bufferLength = bufferPosition = 0; + } + } // namespace orc diff --git a/c++/src/RLE.hh b/c++/src/RLE.hh index a45b4056bc..3ad93e3dc9 100644 --- a/c++/src/RLE.hh +++ b/c++/src/RLE.hh @@ -26,6 +26,7 @@ namespace orc { + NO_SANITIZE_ATTR inline int64_t zigZag(int64_t value) { return (value << 1) ^ (value >> 63); } @@ -84,6 +85,13 @@ namespace orc { virtual void write(int64_t val) = 0; + /** + * Finalize the encoding process. This function should be called after all data required for + * encoding has been added. It ensures that any remaining data is processed and the final state + * of the encoder is set. + */ + virtual void finishEncode(); + protected: std::unique_ptr outputStream; size_t bufferPosition; diff --git a/c++/src/RLEv1.cc b/c++/src/RLEv1.cc index 5d6f600669..72c555e610 100644 --- a/c++/src/RLEv1.cc +++ b/c++/src/RLEv1.cc @@ -74,10 +74,8 @@ namespace orc { } uint64_t RleEncoderV1::flush() { - writeValues(); - outputStream->BackUp(static_cast(bufferLength - bufferPosition)); + finishEncode(); uint64_t dataSize = outputStream->flush(); - bufferLength = bufferPosition = 0; return dataSize; } @@ -135,6 +133,11 @@ namespace orc { } } + void RleEncoderV1::finishEncode() { + writeValues(); + RleEncoder::finishEncode(); + } + signed char RleDecoderV1::readByte() { SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs); if (bufferStart_ == bufferEnd_) { diff --git a/c++/src/RLEv1.hh b/c++/src/RLEv1.hh index a2a00c9305..024b1e5e97 100644 --- a/c++/src/RLEv1.hh +++ b/c++/src/RLEv1.hh @@ -38,6 +38,8 @@ namespace orc { void write(int64_t val) override; + void finishEncode() override; + private: int64_t delta_; bool repeat_; diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh index a8e0340e7e..c2ce5aa851 100644 --- a/c++/src/RLEv2.hh +++ b/c++/src/RLEv2.hh @@ -108,6 +108,8 @@ namespace orc { void write(int64_t val) override; + void finishEncode() override; + private: const bool alignedBitPacking_; uint32_t fixedRunLength_; @@ -121,6 +123,7 @@ namespace orc { int64_t* zigzagLiterals_; int64_t* baseRedLiterals_; int64_t* adjDeltas_; + static constexpr int64_t BASE_VALUE_LIMIT = int64_t(1) << 56; uint32_t getOpCode(EncodingType encoding); int64_t* prepareForDirectOrPatchedBase(EncodingOption& option); diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 8a43818a53..349ae1b407 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -751,27 +751,35 @@ namespace orc { return *(contents_->schema.get()); } - std::unique_ptr ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const { + std::unique_ptr ReaderImpl::getStripeStatistics(uint64_t stripeIndex, + bool includeRowIndex) const { if (!isMetadataLoaded_) { readMetadata(); } if (contents_->metadata == nullptr) { throw std::logic_error("No stripe statistics in file"); } - size_t num_cols = static_cast( - contents_->metadata->stripe_stats(static_cast(stripeIndex)).col_stats_size()); - std::vector> indexStats(num_cols); proto::StripeInformation currentStripeInfo = footer_->stripes(static_cast(stripeIndex)); proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents_.get()); - getRowIndexStatistics(currentStripeInfo, 
stripeIndex, currentStripeFooter, &indexStats); - const Timezone& writerTZ = currentStripeFooter.has_writer_timezone() ? getTimezoneByName(currentStripeFooter.writer_timezone()) : getLocalTimezone(); StatContext statContext(hasCorrectStatistics(), &writerTZ); - return std::make_unique( + + if (!includeRowIndex) { + return std::make_unique( + contents_->metadata->stripe_stats(static_cast(stripeIndex)), statContext); + } + + size_t num_cols = static_cast( + contents_->metadata->stripe_stats(static_cast(stripeIndex)).col_stats_size()); + std::vector> indexStats(num_cols); + + getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats); + + return std::make_unique( contents_->metadata->stripe_stats(static_cast(stripeIndex)), indexStats, statContext); } @@ -865,6 +873,8 @@ namespace orc { case proto::Type_Kind_CHAR: case proto::Type_Kind_STRING: case proto::Type_Kind_VARCHAR: + case proto::Type_Kind_GEOMETRY: + case proto::Type_Kind_GEOGRAPHY: return 4; default: return 0; @@ -1117,7 +1127,7 @@ namespace orc { } bool RowReaderImpl::next(ColumnVectorBatch& data) { - SCOPED_STOPWATCH(contents->readerMetrics, ReaderInclusiveLatencyUs, ReaderCall); + SCOPED_STOPWATCH(contents_->readerMetrics, ReaderInclusiveLatencyUs, ReaderCall); if (currentStripe_ >= lastStripe_) { data.numElements = 0; markEndOfFile(); @@ -1426,17 +1436,10 @@ namespace orc { uint32_t stripeIndex, const std::set& included) const { std::map ret; - // find stripe info - if (stripeIndex >= static_cast(footer_->stripes_size())) { - throw std::logic_error("Illegal stripe index: " + - to_string(static_cast(stripeIndex))); - } - const proto::StripeInformation currentStripeInfo = - footer_->stripes(static_cast(stripeIndex)); - const proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents_); + uint64_t offset; + auto currentStripeFooter = loadCurrentStripeFooter(stripeIndex, offset); // iterate stripe footer to get stream of bloom_filter - uint64_t offset = static_cast(currentStripeInfo.offset()); for (int i = 0; i < currentStripeFooter.streams_size(); i++) { const proto::Stream& stream = currentStripeFooter.streams(i); uint32_t column = static_cast(stream.column()); @@ -1474,6 +1477,150 @@ namespace orc { return ret; } + proto::StripeFooter ReaderImpl::loadCurrentStripeFooter(uint32_t stripeIndex, + uint64_t& offset) const { + // find stripe info + if (stripeIndex >= static_cast(footer_->stripes_size())) { + throw std::logic_error("Illegal stripe index: " + + to_string(static_cast(stripeIndex))); + } + const proto::StripeInformation currentStripeInfo = + footer_->stripes(static_cast(stripeIndex)); + offset = static_cast(currentStripeInfo.offset()); + return getStripeFooter(currentStripeInfo, *contents_); + } + + std::map ReaderImpl::getRowGroupIndex( + uint32_t stripeIndex, const std::set& included) const { + std::map ret; + uint64_t offset; + auto currentStripeFooter = loadCurrentStripeFooter(stripeIndex, offset); + + // iterate stripe footer to get stream of row_index + for (int i = 0; i < currentStripeFooter.streams_size(); i++) { + const proto::Stream& stream = currentStripeFooter.streams(i); + uint32_t column = static_cast(stream.column()); + uint64_t length = static_cast(stream.length()); + RowGroupIndex& rowGroupIndex = ret[column]; + + if (stream.kind() == proto::Stream_Kind_ROW_INDEX && + (included.empty() || included.find(column) != included.end())) { + std::unique_ptr pbStream = + createDecompressor(contents_->compression, + std::make_unique( + contents_->stream.get(), 
offset, length, *contents_->pool), + contents_->blockSize, *(contents_->pool), contents_->readerMetrics); + + proto::RowIndex pbRowIndex; + if (!pbRowIndex.ParseFromZeroCopyStream(pbStream.get())) { + std::stringstream errMsgBuffer; + errMsgBuffer << "Failed to parse RowIndex at column " << column << " in stripe " + << stripeIndex; + throw ParseError(errMsgBuffer.str()); + } + + // add rowGroupIndex to result for one column + for (auto& rowIndexEntry : pbRowIndex.entry()) { + std::vector posVector; + for (auto& position : rowIndexEntry.positions()) { + posVector.push_back(position); + } + rowGroupIndex.positions.push_back(posVector); + } + } + offset += length; + } + return ret; + } + + void ReaderImpl::releaseBuffer(uint64_t boundary) { + std::lock_guard lock(contents_->readCacheMutex); + + if (contents_->readCache) { + contents_->readCache->evictEntriesBefore(boundary); + } + } + + void ReaderImpl::preBuffer(const std::vector& stripes, + const std::list& includeTypes) { + std::vector newStripes; + for (auto stripe : stripes) { + if (stripe < static_cast(footer_->stripes_size())) newStripes.push_back(stripe); + } + + std::list newIncludeTypes; + for (auto type : includeTypes) { + if (type < static_cast(footer_->types_size())) newIncludeTypes.push_back(type); + } + + if (newStripes.empty() || newIncludeTypes.empty()) { + return; + } + + orc::RowReaderOptions rowReaderOptions; + rowReaderOptions.includeTypes(newIncludeTypes); + ColumnSelector columnSelector(contents_.get()); + std::vector selectedColumns; + columnSelector.updateSelected(selectedColumns, rowReaderOptions); + + std::vector ranges; + ranges.reserve(newIncludeTypes.size()); + for (auto stripe : newStripes) { + // get stripe information + const auto& stripeInfo = footer_->stripes(stripe); + uint64_t stripeFooterStart = + stripeInfo.offset() + stripeInfo.index_length() + stripeInfo.data_length(); + uint64_t stripeFooterLength = stripeInfo.footer_length(); + + // get stripe footer + std::unique_ptr pbStream = createDecompressor( + contents_->compression, + std::make_unique(contents_->stream.get(), stripeFooterStart, + stripeFooterLength, *contents_->pool), + contents_->blockSize, *contents_->pool, contents_->readerMetrics); + proto::StripeFooter stripeFooter; + if (!stripeFooter.ParseFromZeroCopyStream(pbStream.get())) { + throw ParseError(std::string("bad StripeFooter from ") + pbStream->getName()); + } + + // traverse all streams in stripe footer, choose selected streams to prebuffer + uint64_t offset = stripeInfo.offset(); + for (int i = 0; i < stripeFooter.streams_size(); i++) { + const proto::Stream& stream = stripeFooter.streams(i); + if (offset + stream.length() > stripeFooterStart) { + std::stringstream msg; + msg << "Malformed stream meta at stream index " << i << " in stripe " << stripe + << ": streamOffset=" << offset << ", streamLength=" << stream.length() + << ", stripeOffset=" << stripeInfo.offset() + << ", stripeIndexLength=" << stripeInfo.index_length() + << ", stripeDataLength=" << stripeInfo.data_length(); + throw ParseError(msg.str()); + } + + if (stream.has_kind() && selectedColumns[stream.column()]) { + const auto& kind = stream.kind(); + if (kind == proto::Stream_Kind_DATA || kind == proto::Stream_Kind_DICTIONARY_DATA || + kind == proto::Stream_Kind_PRESENT || kind == proto::Stream_Kind_LENGTH || + kind == proto::Stream_Kind_SECONDARY) { + ranges.emplace_back(offset, stream.length()); + } + } + + offset += stream.length(); + } + + { + std::lock_guard lock(contents_->readCacheMutex); + + if 
(!contents_->readCache) { + contents_->readCache = std::make_shared( + getStream(), options_.getCacheOptions(), contents_->pool, contents_->readerMetrics); + } + contents_->readCache->cache(std::move(ranges)); + } + } + } + RowReader::~RowReader() { // PASS } diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh index 630d812c38..3d81d26920 100644 --- a/c++/src/Reader.hh +++ b/c++/src/Reader.hh @@ -26,6 +26,8 @@ #include "ColumnReader.hh" #include "RLE.hh" +#include "io/Cache.hh" + #include "SchemaEvolution.hh" #include "TypeImpl.hh" #include "sargs/SargsApplier.hh" @@ -70,6 +72,11 @@ namespace orc { bool isDecimalAsLong; std::unique_ptr metadata; ReaderMetrics* readerMetrics; + + // mutex to protect readCache_ from concurrent access + std::mutex readCacheMutex; + // cached io ranges. only valid when preBuffer is invoked. + std::shared_ptr readCache; }; proto::StripeFooter getStripeFooter(const proto::StripeInformation& info, @@ -245,6 +252,10 @@ namespace orc { const SchemaEvolution* getSchemaEvolution() const { return &schemaEvolution_; } + + std::shared_ptr getReadCache() const { + return contents_->readCache; + } }; class ReaderImpl : public Reader { @@ -260,15 +271,16 @@ namespace orc { // footer proto::Footer* footer_; uint64_t numberOfStripes_; + uint64_t getMemoryUse(int stripeIx, std::vector& selectedColumns); // internal methods void readMetadata() const; void checkOrcVersion(); - void getRowIndexStatistics( - const proto::StripeInformation& stripeInfo, uint64_t stripeIndex, - const proto::StripeFooter& currentStripeFooter, - std::vector >* indexStats) const; + void getRowIndexStatistics(const proto::StripeInformation& stripeInfo, uint64_t stripeIndex, + const proto::StripeFooter& currentStripeFooter, + std::vector>* indexStats) const; + proto::StripeFooter loadCurrentStripeFooter(uint32_t stripeIndex, uint64_t& offset) const; // metadata mutable bool isMetadataLoaded_; @@ -318,7 +330,8 @@ namespace orc { const std::string& getStreamName() const override; - std::unique_ptr getStripeStatistics(uint64_t stripeIndex) const override; + std::unique_ptr getStripeStatistics( + uint64_t stripeIndex, bool includeRowIndex = true) const override; std::unique_ptr createRowReader() const override; @@ -374,6 +387,13 @@ namespace orc { std::map getBloomFilters( uint32_t stripeIndex, const std::set& included) const override; + + void preBuffer(const std::vector& stripes, + const std::list& includeTypes) override; + void releaseBuffer(uint64_t boundary) override; + + std::map getRowGroupIndex( + uint32_t stripeIndex, const std::set& included) const override; }; } // namespace orc diff --git a/c++/src/RleEncoderV2.cc b/c++/src/RleEncoderV2.cc index 18c5200254..91383bb569 100644 --- a/c++/src/RleEncoderV2.cc +++ b/c++/src/RleEncoderV2.cc @@ -423,7 +423,7 @@ namespace orc { // fallback to DIRECT encoding. // The decision to use patched base was based on zigzag values, but the // actual patching is done on base reduced literals. 
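// Note on the change just below: in addition to a non-zero gap between the 95th and
// 100th percentile bit widths, PATCHED_BASE is now chosen only while the magnitude of
// the run's minimum stays below BASE_VALUE_LIMIT; otherwise the encoder takes the
// DIRECT fallback described above. As an illustrative case (assuming BASE_VALUE_LIMIT
// is well below INT64_MAX), a run such as {INT64_MIN + 1, 0, 100}, which previously
// attempted PATCHED_BASE, now encodes as DIRECT because std::abs(option.min) exceeds
// the limit.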
- if ((option.brBits100p - option.brBits95p) != 0) { + if ((option.brBits100p - option.brBits95p) != 0 && std::abs(option.min) < BASE_VALUE_LIMIT) { option.encoding = PATCHED_BASE; preparePatchedBlob(option); return; @@ -440,31 +440,8 @@ namespace orc { } uint64_t RleEncoderV2::flush() { - if (numLiterals != 0) { - EncodingOption option = {}; - if (variableRunLength_ != 0) { - determineEncoding(option); - writeValues(option); - } else if (fixedRunLength_ != 0) { - if (fixedRunLength_ < MIN_REPEAT) { - variableRunLength_ = fixedRunLength_; - fixedRunLength_ = 0; - determineEncoding(option); - writeValues(option); - } else if (fixedRunLength_ >= MIN_REPEAT && fixedRunLength_ <= MAX_SHORT_REPEAT_LENGTH) { - option.encoding = SHORT_REPEAT; - writeValues(option); - } else { - option.encoding = DELTA; - option.isFixedDelta = true; - writeValues(option); - } - } - } - - outputStream->BackUp(static_cast(bufferLength - bufferPosition)); + finishEncode(); uint64_t dataSize = outputStream->flush(); - bufferLength = bufferPosition = 0; return dataSize; } @@ -779,4 +756,30 @@ namespace orc { fixedRunLength_ = 1; variableRunLength_ = 1; } + + void RleEncoderV2::finishEncode() { + if (numLiterals != 0) { + EncodingOption option = {}; + if (variableRunLength_ != 0) { + determineEncoding(option); + writeValues(option); + } else if (fixedRunLength_ != 0) { + if (fixedRunLength_ < MIN_REPEAT) { + variableRunLength_ = fixedRunLength_; + fixedRunLength_ = 0; + determineEncoding(option); + writeValues(option); + } else if (fixedRunLength_ >= MIN_REPEAT && fixedRunLength_ <= MAX_SHORT_REPEAT_LENGTH) { + option.encoding = SHORT_REPEAT; + writeValues(option); + } else { + option.encoding = DELTA; + option.isFixedDelta = true; + writeValues(option); + } + } + } + + RleEncoder::finishEncode(); + } } // namespace orc diff --git a/c++/src/SchemaEvolution.cc b/c++/src/SchemaEvolution.cc index 4099818ff9..442c43c228 100644 --- a/c++/src/SchemaEvolution.cc +++ b/c++/src/SchemaEvolution.cc @@ -18,6 +18,7 @@ #include "SchemaEvolution.hh" #include "orc/Exceptions.hh" +#include "orc/Type.hh" namespace orc { @@ -80,7 +81,7 @@ namespace orc { if (readType.getKind() == fileType.getKind()) { ret.isValid = true; if (fileType.getKind() == CHAR || fileType.getKind() == VARCHAR) { - ret.isValid = readType.getMaximumLength() == fileType.getMaximumLength(); + ret.needConvert = readType.getMaximumLength() != fileType.getMaximumLength(); } else if (fileType.getKind() == DECIMAL) { ret.needConvert = readType.getPrecision() != fileType.getPrecision() || readType.getScale() != fileType.getScale(); @@ -105,11 +106,17 @@ namespace orc { } case STRING: case CHAR: - case VARCHAR: + case VARCHAR: { + ret.isValid = ret.needConvert = isStringVariant(readType) || isNumeric(readType) || + isTimestamp(readType) || isDecimal(readType); + break; + } case TIMESTAMP: case TIMESTAMP_INSTANT: case DATE: - case BINARY: { + case BINARY: + case GEOMETRY: + case GEOGRAPHY: { // Not support break; } @@ -231,6 +238,8 @@ namespace orc { case FLOAT: case DOUBLE: case BINARY: + case GEOMETRY: + case GEOGRAPHY: case TIMESTAMP: case LIST: case MAP: diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc index f9581215b3..a86247f107 100644 --- a/c++/src/Statistics.cc +++ b/c++/src/Statistics.cc @@ -44,6 +44,8 @@ namespace orc { return new DateColumnStatisticsImpl(s, statContext); } else if (s.has_binary_statistics()) { return new BinaryColumnStatisticsImpl(s, statContext); + } else if (s.has_geospatial_statistics()) { + return new 
GeospatialColumnStatisticsImpl(s); } else { return new ColumnStatisticsImpl(s); } @@ -81,11 +83,20 @@ namespace orc { // PASS } - StripeStatisticsImpl::StripeStatisticsImpl( + StripeStatisticsImpl::StripeStatisticsImpl(const proto::StripeStatistics& stripeStats, + const StatContext& statContext) { + columnStats_ = std::make_unique(stripeStats, statContext); + } + + StripeStatisticsWithRowGroupIndexImpl::~StripeStatisticsWithRowGroupIndexImpl() { + // PASS + } + + StripeStatisticsWithRowGroupIndexImpl::StripeStatisticsWithRowGroupIndexImpl( const proto::StripeStatistics& stripeStats, std::vector >& indexStats, - const StatContext& statContext) { - columnStats_ = std::make_unique(stripeStats, statContext); + const StatContext& statContext) + : StripeStatisticsImpl(stripeStats, statContext) { rowIndexStats_.resize(indexStats.size()); for (size_t i = 0; i < rowIndexStats_.size(); i++) { for (size_t j = 0; j < indexStats[i].size(); j++) { @@ -139,6 +150,10 @@ namespace orc { // PASS } + GeospatialColumnStatistics::~GeospatialColumnStatistics() { + // PASS + } + ColumnStatisticsImpl::~ColumnStatisticsImpl() { // PASS } @@ -179,15 +194,19 @@ namespace orc { // PASS } + GeospatialColumnStatisticsImpl::~GeospatialColumnStatisticsImpl() { + // PASS + } + ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& pb) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); } BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (pb.has_binary_statistics() && statContext.correctStats) { stats_.setHasTotalLength(pb.binary_statistics().has_sum()); stats_.setTotalLength(static_cast(pb.binary_statistics().sum())); @@ -197,7 +216,7 @@ namespace orc { BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (pb.has_bucket_statistics() && statContext.correctStats) { hasCount_ = true; trueCount_ = pb.bucket_statistics().count(0); @@ -210,7 +229,7 @@ namespace orc { DateColumnStatisticsImpl::DateColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_date_statistics() || !statContext.correctStats) { // hasMinimum_ is false by default; // hasMaximum_ is false by default; @@ -227,7 +246,7 @@ namespace orc { DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? 
pb.has_null() : true); if (pb.has_decimal_statistics() && statContext.correctStats) { const proto::DecimalStatistics& stats = pb.decimal_statistics(); stats_.setHasMinimum(stats.has_minimum()); @@ -242,7 +261,7 @@ namespace orc { DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl(const proto::ColumnStatistics& pb) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_double_statistics()) { stats_.setMinimum(0); stats_.setMaximum(0); @@ -261,7 +280,7 @@ namespace orc { IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl(const proto::ColumnStatistics& pb) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_int_statistics()) { stats_.setMinimum(0); stats_.setMaximum(0); @@ -281,7 +300,7 @@ namespace orc { StringColumnStatisticsImpl::StringColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_string_statistics() || !statContext.correctStats) { stats_.setTotalLength(0); } else { @@ -299,7 +318,7 @@ namespace orc { TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); if (!pb.has_timestamp_statistics() || !statContext.correctStats) { stats_.setMinimum(0); stats_.setMaximum(0); @@ -365,7 +384,7 @@ namespace orc { CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl( const proto::ColumnStatistics& pb) { stats_.setNumberOfValues(pb.number_of_values()); - stats_.setHasNull(pb.has_null()); + stats_.setHasNull(pb.has_has_null() ? 
pb.has_null() : true); if (!pb.has_collection_statistics()) { stats_.setMinimum(0); stats_.setMaximum(0); @@ -382,6 +401,40 @@ namespace orc { } } + GeospatialColumnStatisticsImpl::GeospatialColumnStatisticsImpl( + const proto::ColumnStatistics& pb) { + reset(); + if (!pb.has_geospatial_statistics()) { + bounder_.invalidate(); + } else { + const proto::GeospatialStatistics& stats = pb.geospatial_statistics(); + geospatial::BoundingBox::XYZM min; + geospatial::BoundingBox::XYZM max; + for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) { + min[i] = max[i] = std::numeric_limits::quiet_NaN(); + } + if (stats.has_bbox()) { + const auto& protoBBox = stats.bbox(); + min[0] = protoBBox.xmin(); + min[1] = protoBBox.ymin(); + max[0] = protoBBox.xmax(); + max[1] = protoBBox.ymax(); + if (protoBBox.has_zmin() && protoBBox.has_zmax()) { + min[2] = protoBBox.zmin(); + max[2] = protoBBox.zmax(); + } + if (protoBBox.has_mmin() && protoBBox.has_mmax()) { + min[3] = protoBBox.mmin(); + max[3] = protoBBox.mmax(); + } + } + bounder_.mergeBox(geospatial::BoundingBox(min, max)); + std::vector types = {stats.geospatial_types().begin(), + stats.geospatial_types().end()}; + bounder_.mergeGeometryTypes(types); + } + } + std::unique_ptr createColumnStatistics(const Type& type) { switch (static_cast(type.getKind())) { case BOOLEAN: @@ -413,6 +466,9 @@ namespace orc { return std::make_unique(); case DECIMAL: return std::make_unique(); + case GEOGRAPHY: + case GEOMETRY: + return std::make_unique(); default: throw NotImplementedYet("Not supported type: " + type.toString()); } diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh index 6f212c15cc..94b1e5d2b2 100644 --- a/c++/src/Statistics.hh +++ b/c++/src/Statistics.hh @@ -24,6 +24,7 @@ #include "orc/OrcFile.hh" #include "orc/Reader.hh" +#include "Geospatial.hh" #include "Timezone.hh" #include "TypeImpl.hh" @@ -1683,6 +1684,127 @@ namespace orc { } }; + class GeospatialColumnStatisticsImpl : public GeospatialColumnStatistics, + public MutableColumnStatistics { + private: + geospatial::WKBGeometryBounder bounder_; + InternalCharStatistics stats_; + + public: + GeospatialColumnStatisticsImpl() { + reset(); + } + explicit GeospatialColumnStatisticsImpl(const proto::ColumnStatistics& stats); + virtual ~GeospatialColumnStatisticsImpl(); + + uint64_t getNumberOfValues() const override { + return stats_.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + stats_.setNumberOfValues(value); + } + + void increase(uint64_t count) override { + stats_.setNumberOfValues(stats_.getNumberOfValues() + count); + } + + bool hasNull() const override { + return stats_.hasNull(); + } + + void setHasNull(bool hasNull) override { + stats_.setHasNull(hasNull); + } + + void merge(const MutableColumnStatistics& other) override { + const GeospatialColumnStatisticsImpl& geoStats = + dynamic_cast(other); + stats_.merge(geoStats.stats_); + bounder_.merge(geoStats.bounder_); + } + + void reset() override { + stats_.reset(); + bounder_.reset(); + } + + void update(const char* value, size_t length) override { + bounder_.mergeGeometry(std::string_view(value, length)); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_has_null(stats_.hasNull()); + pbStats.set_number_of_values(stats_.getNumberOfValues()); + + proto::GeospatialStatistics* geoStats = pbStats.mutable_geospatial_statistics(); + const auto& bbox = bounder_.bounds(); + if (bbox.boundValid(0) && bbox.boundValid(1) && !bbox.boundEmpty(0) && !bbox.boundEmpty(1)) { + 
geoStats->mutable_bbox()->set_xmin(bbox.min[0]); + geoStats->mutable_bbox()->set_xmax(bbox.max[0]); + geoStats->mutable_bbox()->set_ymin(bbox.min[1]); + geoStats->mutable_bbox()->set_ymax(bbox.max[1]); + if (bbox.boundValid(2) && !bbox.boundEmpty(2)) { + geoStats->mutable_bbox()->set_zmin(bbox.min[2]); + geoStats->mutable_bbox()->set_zmax(bbox.max[2]); + } + if (bbox.boundValid(3) && !bbox.boundEmpty(3)) { + geoStats->mutable_bbox()->set_mmin(bbox.min[3]); + geoStats->mutable_bbox()->set_mmax(bbox.max[3]); + } + } + for (auto type : bounder_.geometryTypes()) { + geoStats->add_geospatial_types(type); + } + } + + std::string toString() const override { + if (!bounder_.isValid()) { + return " invalid"; + } + + std::stringstream ss; + ss << ""; + + std::string dim_label("xyzm"); + const auto& bbox = bounder_.bounds(); + auto dim_valid = bbox.dimensionValid(); + auto dim_empty = bbox.dimensionEmpty(); + auto lower = bbox.lowerBound(); + auto upper = bbox.upperBound(); + + for (int i = 0; i < 4; i++) { + ss << " " << dim_label[i] << ": "; + if (!dim_valid[i]) { + ss << "invalid"; + } else if (dim_empty[i]) { + ss << "empty"; + } else { + ss << "[" << lower[i] << ", " << upper[i] << "]"; + } + } + + std::vector maybe_geometry_types = bounder_.geometryTypes(); + ss << " geometry_types: ["; + std::string sep(""); + for (int32_t geometry_type : maybe_geometry_types) { + ss << sep << geometry_type; + sep = ", "; + } + ss << "]"; + + return ss.str(); + } + + const geospatial::BoundingBox& getBoundingBox() const override { + return bounder_.bounds(); + } + + std::vector getGeospatialTypes() const override { + return bounder_.geometryTypes(); + } + }; + ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, const StatContext& statContext); @@ -1713,7 +1835,6 @@ namespace orc { class StripeStatisticsImpl : public StripeStatistics { private: std::unique_ptr columnStats_; - std::vector > > rowIndexStats_; // DELIBERATELY NOT IMPLEMENTED StripeStatisticsImpl(const StripeStatisticsImpl&); @@ -1721,7 +1842,6 @@ namespace orc { public: StripeStatisticsImpl(const proto::StripeStatistics& stripeStats, - std::vector >& indexStats, const StatContext& statContext); virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override { @@ -1732,13 +1852,38 @@ namespace orc { return columnStats_->getNumberOfColumns(); } + virtual const ColumnStatistics* getRowIndexStatistics(uint32_t, uint32_t) const override { + throw NotImplementedYet("set includeRowIndex true to get row index stats"); + } + + virtual ~StripeStatisticsImpl() override; + + virtual uint32_t getNumberOfRowIndexStats(uint32_t) const override { + throw NotImplementedYet("set includeRowIndex true to get row index stats"); + } + }; + + class StripeStatisticsWithRowGroupIndexImpl : public StripeStatisticsImpl { + private: + std::vector > > rowIndexStats_; + + // DELIBERATELY NOT IMPLEMENTED + StripeStatisticsWithRowGroupIndexImpl(const StripeStatisticsWithRowGroupIndexImpl&); + StripeStatisticsWithRowGroupIndexImpl& operator=(const StripeStatisticsWithRowGroupIndexImpl&); + + public: + StripeStatisticsWithRowGroupIndexImpl( + const proto::StripeStatistics& stripeStats, + std::vector >& indexStats, + const StatContext& statContext); + virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId, uint32_t rowIndex) const override { // check id indices are valid return rowIndexStats_[columnId][rowIndex].get(); } - virtual ~StripeStatisticsImpl() override; + virtual ~StripeStatisticsWithRowGroupIndexImpl() 
override; uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override { return static_cast(rowIndexStats_[columnId].size()); diff --git a/c++/src/StripeStream.cc b/c++/src/StripeStream.cc index f4345c0871..a5609f7629 100644 --- a/c++/src/StripeStream.cc +++ b/c++/src/StripeStream.cc @@ -19,6 +19,7 @@ #include "StripeStream.hh" #include "RLE.hh" #include "Reader.hh" +#include "io/Cache.hh" #include "orc/Exceptions.hh" #include "wrap/coded-stream-wrapper.h" @@ -37,7 +38,8 @@ namespace orc { stripeStart_(stripeStart), input_(input), writerTimezone_(writerTimezone), - readerTimezone_(readerTimezone) { + readerTimezone_(readerTimezone), + readCache_(reader.getReadCache()) { // PASS } @@ -89,7 +91,6 @@ namespace orc { if (stream.has_kind() && stream.kind() == kind && stream.column() == static_cast(columnId)) { uint64_t streamLength = stream.length(); - uint64_t myBlock = shouldStream ? input_.getNaturalReadSize() : streamLength; if (offset + streamLength > dataEnd) { std::stringstream msg; msg << "Malformed stream meta at stream index " << i << " in stripe " << stripeIndex_ @@ -99,9 +100,23 @@ namespace orc { << ", stripeDataLength=" << stripeInfo_.data_length(); throw ParseError(msg.str()); } - return createDecompressor(reader_.getCompression(), - std::make_unique( - &input_, offset, stream.length(), *pool, myBlock), + + BufferSlice slice; + if (readCache_) { + ReadRange range{offset, streamLength}; + slice = readCache_->read(range); + } + + uint64_t myBlock = shouldStream ? input_.getNaturalReadSize() : streamLength; + std::unique_ptr seekableInput; + if (slice.buffer) { + seekableInput = std::make_unique( + slice.buffer->data() + slice.offset, slice.length); + } else { + seekableInput = std::make_unique(&input_, offset, streamLength, + *pool, myBlock); + } + return createDecompressor(reader_.getCompression(), std::move(seekableInput), reader_.getCompressionSize(), *pool, reader_.getFileContents().readerMetrics); } diff --git a/c++/src/StripeStream.hh b/c++/src/StripeStream.hh index ad82d472c2..2d26f8575e 100644 --- a/c++/src/StripeStream.hh +++ b/c++/src/StripeStream.hh @@ -30,6 +30,7 @@ namespace orc { class RowReaderImpl; + class ReadRangeCache; /** * StripeStream Implementation @@ -45,6 +46,7 @@ namespace orc { InputStream& input_; const Timezone& writerTimezone_; const Timezone& readerTimezone_; + std::shared_ptr readCache_; public: StripeStreamsImpl(const RowReaderImpl& reader, uint64_t index, diff --git a/c++/src/Timezone.cc b/c++/src/Timezone.cc index 32276a850d..384f8ea99f 100644 --- a/c++/src/Timezone.cc +++ b/c++/src/Timezone.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -655,25 +656,24 @@ namespace orc { epoch_ = utcEpoch - getVariant(utcEpoch).gmtOffset; } - const char* getTimezoneDirectory() { + std::string getTimezoneDirectory() { const char* dir = getenv("TZDIR"); if (!dir) { - dir = DEFAULT_TZDIR; + // this is present if we're in an activated conda environment + const char* condaPrefix = getenv("CONDA_PREFIX"); + if (condaPrefix) { + std::string condaDir(condaPrefix); + condaDir += "/share/zoneinfo"; + return condaDir; + } else { + dir = DEFAULT_TZDIR; + } } return dir; } - /** - * Get a timezone by absolute filename. - * Results are cached. 
- */ - const Timezone& getTimezoneByFilename(const std::string& filename) { - // ORC-110 - std::lock_guard timezone_lock(timezone_mutex); - std::map >::iterator itr = timezoneCache.find(filename); - if (itr != timezoneCache.end()) { - return *(itr->second).get(); - } + static std::vector loadTZDB(const std::string& filename) { + std::vector buffer; if (!fileExists(filename.c_str())) { std::stringstream ss; ss << "Time zone file " << filename << " does not exist." @@ -683,12 +683,65 @@ namespace orc { try { std::unique_ptr file = readFile(filename); size_t size = static_cast(file->getLength()); - std::vector buffer(size); + buffer.resize(size); file->read(&buffer[0], size, 0); - timezoneCache[filename] = std::make_shared(filename, buffer); } catch (ParseError& err) { throw TimezoneError(err.what()); } + return buffer; + } + + class LazyTimezone : public Timezone { + private: + std::string filename_; + mutable std::unique_ptr impl_; + mutable std::once_flag initialized_; + + TimezoneImpl* getImpl() const { + std::call_once(initialized_, [&]() { + auto buffer = loadTZDB(filename_); + impl_ = std::make_unique(filename_, std::move(buffer)); + }); + return impl_.get(); + } + + public: + LazyTimezone(const std::string& filename) : filename_(filename) {} + + const TimezoneVariant& getVariant(int64_t clk) const override { + return getImpl()->getVariant(clk); + } + int64_t getEpoch() const override { + return getImpl()->getEpoch(); + } + void print(std::ostream& os) const override { + return getImpl()->print(os); + } + uint64_t getVersion() const override { + return getImpl()->getVersion(); + } + + int64_t convertToUTC(int64_t clk) const override { + return getImpl()->convertToUTC(clk); + } + + int64_t convertFromUTC(int64_t clk) const override { + return getImpl()->convertFromUTC(clk); + } + }; + + /** + * Get a timezone by absolute filename. + * Results are cached. 
+ */ + const Timezone& getTimezoneByFilename(const std::string& filename) { + // ORC-110 + std::lock_guard timezone_lock(timezone_mutex); + std::map >::iterator itr = timezoneCache.find(filename); + if (itr != timezoneCache.end()) { + return *(itr->second).get(); + } + timezoneCache[filename] = std::make_shared(filename); return *timezoneCache[filename].get(); } diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc index c7b073c713..18c4985ab1 100644 --- a/c++/src/TypeImpl.cc +++ b/c++/src/TypeImpl.cc @@ -19,8 +19,10 @@ #include "TypeImpl.hh" #include "Adaptor.hh" #include "orc/Exceptions.hh" +#include "orc/Type.hh" #include +#include #include namespace orc { @@ -62,6 +64,33 @@ namespace orc { subtypeCount_ = 0; } + TypeImpl::TypeImpl(TypeKind kind, const std::string& crs) { + parent_ = nullptr; + columnId_ = -1; + maximumColumnId_ = -1; + kind_ = kind; + maxLength_ = 0; + precision_ = 0; + scale_ = 0; + subtypeCount_ = 0; + crs_ = crs; + edgeInterpolationAlgorithm_ = geospatial::EdgeInterpolationAlgorithm::SPHERICAL; + } + + TypeImpl::TypeImpl(TypeKind kind, const std::string& crs, + geospatial::EdgeInterpolationAlgorithm algo) { + parent_ = nullptr; + columnId_ = -1; + maximumColumnId_ = -1; + kind_ = kind; + maxLength_ = 0; + precision_ = 0; + scale_ = 0; + subtypeCount_ = 0; + crs_ = crs; + edgeInterpolationAlgorithm_ = algo; + } + uint64_t TypeImpl::assignIds(uint64_t root) const { columnId_ = static_cast(root); uint64_t current = root + 1; @@ -120,6 +149,14 @@ namespace orc { return scale_; } + const std::string& TypeImpl::getCrs() const { + return crs_; + } + + geospatial::EdgeInterpolationAlgorithm TypeImpl::getAlgorithm() const { + return edgeInterpolationAlgorithm_; + } + Type& TypeImpl::setAttribute(const std::string& key, const std::string& value) { attributes_[key] = value; return *this; @@ -189,6 +226,45 @@ namespace orc { return true; } + namespace geospatial { + std::string AlgoToString(EdgeInterpolationAlgorithm algo) { + switch (algo) { + case EdgeInterpolationAlgorithm::SPHERICAL: + return "speherial"; + case VINCENTY: + return "vincenty"; + case THOMAS: + return "thomas"; + case ANDOYER: + return "andoyer"; + case KARNEY: + return "karney"; + default: + throw InvalidArgument("Unknown algo"); + } + } + + EdgeInterpolationAlgorithm AlgoFromString(const std::string& algo) { + if (algo == "speherial") { + return EdgeInterpolationAlgorithm::SPHERICAL; + } + if (algo == "vincenty") { + return VINCENTY; + } + if (algo == "thomas") { + return THOMAS; + } + if (algo == "andoyer") { + return ANDOYER; + } + if (algo == "karney") { + return KARNEY; + } + throw InvalidArgument("Unknown algo: " + algo); + } + + } // namespace geospatial + std::string TypeImpl::toString() const { switch (static_cast(kind_)) { case BOOLEAN: @@ -271,6 +347,17 @@ namespace orc { result << "char(" << maxLength_ << ")"; return result.str(); } + case GEOMETRY: { + std::stringstream result; + result << "geometry(" << crs_ << ")"; + return result.str(); + } + case GEOGRAPHY: { + std::stringstream result; + result << "geography(" << crs_ << "," + << geospatial::AlgoToString(edgeInterpolationAlgorithm_) << ")"; + return result.str(); + } default: throw NotImplementedYet("Unknown type"); } @@ -322,6 +409,8 @@ namespace orc { case BINARY: case CHAR: case VARCHAR: + case GEOMETRY: + case GEOGRAPHY: return encoded ? 
std::make_unique(capacity, memoryPool) : std::make_unique(capacity, memoryPool); @@ -419,6 +508,15 @@ namespace orc { return std::make_unique(UNION); } + std::unique_ptr createGeometryType(const std::string& crs) { + return std::make_unique(GEOMETRY, crs); + } + + std::unique_ptr createGeographyType(const std::string& crs, + geospatial::EdgeInterpolationAlgorithm algo) { + return std::make_unique(GEOGRAPHY, crs, algo); + } + std::string printProtobufMessage(const google::protobuf::Message& message); std::unique_ptr convertType(const proto::Type& type, const proto::Footer& footer) { std::unique_ptr ret; @@ -443,6 +541,16 @@ namespace orc { ret = std::make_unique(static_cast(type.kind()), type.maximum_length()); break; + case proto::Type_Kind_GEOMETRY: + ret = std::make_unique(static_cast(type.kind()), type.crs()); + break; + + case proto::Type_Kind_GEOGRAPHY: + ret = std::make_unique( + static_cast(type.kind()), type.crs(), + static_cast(type.algorithm())); + break; + case proto::Type_Kind_DECIMAL: ret = std::make_unique(DECIMAL, type.precision(), type.scale()); break; @@ -523,6 +631,13 @@ namespace orc { case CHAR: result = std::make_unique(fileType->getKind(), fileType->getMaximumLength()); break; + case GEOMETRY: + result = std::make_unique(fileType->getKind(), fileType->getCrs()); + break; + case GEOGRAPHY: + result = std::make_unique(fileType->getKind(), fileType->getCrs(), + fileType->getAlgorithm()); + break; case LIST: result = std::make_unique(fileType->getKind()); @@ -660,7 +775,8 @@ namespace orc { std::pair nameRes = parseName(input, pos, end); pos = nameRes.second; if (input[pos] != ':') { - throw std::logic_error("Invalid struct type. No field name set."); + throw std::logic_error("Invalid struct type. Field name can not contain '" + + std::string(1, input[pos]) + "'."); } std::pair, size_t> typeRes = TypeImpl::parseType(input, ++pos, end); result->addStructField(nameRes.first, std::move(typeRes.first)); @@ -709,6 +825,22 @@ namespace orc { return std::make_unique(DECIMAL, precision, scale); } + std::unique_ptr TypeImpl::parseGeographyType(const std::string& input, size_t start, + size_t end) { + if (input[start] != '(') { + throw std::logic_error("Missing ( after geography."); + } + size_t pos = start + 1; + size_t sep = input.find(',', pos); + if (sep + 1 >= end || sep == std::string::npos) { + throw std::logic_error("Geography type must specify CRS."); + } + std::string crs = input.substr(pos, sep - pos); + std::string algoStr = input.substr(sep + 1, end - sep - 1); + geospatial::EdgeInterpolationAlgorithm algo = geospatial::AlgoFromString(algoStr); + return std::make_unique(GEOGRAPHY, crs, algo); + } + void validatePrimitiveType(std::string category, const std::string& input, const size_t pos) { if (input[pos] == '<' || input[pos] == '(') { std::ostringstream oss; @@ -779,6 +911,14 @@ namespace orc { uint64_t maxLength = static_cast(atoi(input.substr(start + 1, end - start + 1).c_str())); return std::make_unique(CHAR, maxLength); + } else if (category == "geometry") { + if (input[start] != '(') { + throw std::logic_error("Missing ( after geometry."); + } + std::string crs = input.substr(start + 1, end - start + 1); + return std::make_unique(GEOMETRY, crs); + } else if (category == "geography") { + return parseGeographyType(input, start, end); } else { throw std::logic_error("Unknown type " + category); } diff --git a/c++/src/TypeImpl.hh b/c++/src/TypeImpl.hh index 647d5a5d2c..2db175aba6 100644 --- a/c++/src/TypeImpl.hh +++ b/c++/src/TypeImpl.hh @@ -24,6 +24,7 @@ #include 
"Adaptor.hh" #include "wrap/orc-proto-wrapper.hh" +#include #include namespace orc { @@ -41,6 +42,9 @@ namespace orc { uint64_t precision_; uint64_t scale_; std::map attributes_; + std::string crs_; + geospatial::EdgeInterpolationAlgorithm edgeInterpolationAlgorithm_ = + geospatial::EdgeInterpolationAlgorithm::SPHERICAL; public: /** @@ -58,6 +62,16 @@ namespace orc { */ TypeImpl(TypeKind kind, uint64_t precision, uint64_t scale); + /** + * Create geometry type. + */ + TypeImpl(TypeKind kind, const std::string& crs); + + /** + * Create geography type. + */ + TypeImpl(TypeKind kind, const std::string& crs, geospatial::EdgeInterpolationAlgorithm algo); + uint64_t getColumnId() const override; uint64_t getMaximumColumnId() const override; @@ -76,6 +90,10 @@ namespace orc { uint64_t getScale() const override; + const std::string& getCrs() const override; + + geospatial::EdgeInterpolationAlgorithm getAlgorithm() const override; + Type& setAttribute(const std::string& key, const std::string& value) override; bool hasAttributeKey(const std::string& key) const override; @@ -176,6 +194,14 @@ namespace orc { static std::unique_ptr parseDecimalType(const std::string& input, size_t start, size_t end); + /** + * Parse geography type from string + * @param input the input string of a decimal type + * @param start start position of the input string + * @param end end position of the input string + */ + static std::unique_ptr parseGeographyType(const std::string& input, size_t start, + size_t end); /** * Parse type for a category * @param category type name diff --git a/c++/src/Utils.hh b/c++/src/Utils.hh index 4a609788f9..851d0af15c 100644 --- a/c++/src/Utils.hh +++ b/c++/src/Utils.hh @@ -21,6 +21,7 @@ #include #include +#include namespace orc { @@ -70,6 +71,75 @@ namespace orc { #define SCOPED_MINUS_STOPWATCH(METRICS_PTR, LATENCY_VAR) #endif + struct Utf8Utils { + /** + * Counts how many utf-8 chars of the input data + */ + static uint64_t charLength(const char* data, uint64_t length) { + uint64_t chars = 0; + for (uint64_t i = 0; i < length; i++) { + if (isUtfStartByte(data[i])) { + chars++; + } + } + return chars; + } + + /** + * Return the number of bytes required to read at most maxCharLength + * characters in full from a utf-8 encoded byte array provided + * by data. This does not validate utf-8 data, but + * operates correctly on already valid utf-8 data. + * + * @param maxCharLength number of characters required + * @param data the bytes of UTF-8 + * @param length the length of data to truncate + */ + static uint64_t truncateBytesTo(uint64_t maxCharLength, const char* data, uint64_t length) { + uint64_t chars = 0; + if (length <= maxCharLength) { + return length; + } + for (uint64_t i = 0; i < length; i++) { + if (isUtfStartByte(data[i])) { + chars++; + } + if (chars > maxCharLength) { + return i; + } + } + // everything fits + return length; + } + + /** + * Checks if b is the first byte of a UTF-8 character. + */ + inline static bool isUtfStartByte(char b) { + return (b & 0xC0) != 0x80; + } + + /** + * Find the start of the last character that ends in the current string. 
+ * @param text the bytes of the utf-8 + * @param from the first byte location + * @param until the last byte location + * @return the index of the last character + */ + static uint64_t findLastCharacter(const char* text, uint64_t from, uint64_t until) { + uint64_t posn = until; + /* we don't expect characters more than 5 bytes */ + while (posn >= from) { + if (isUtfStartByte(text[posn])) { + return posn; + } + posn -= 1; + } + /* beginning of a valid char not found */ + throw std::logic_error("Could not truncate string, beginning of a valid char not found"); + } + }; + } // namespace orc #endif diff --git a/c++/src/Vector.cc b/c++/src/Vector.cc index bc44469959..49f47aeb03 100644 --- a/c++/src/Vector.cc +++ b/c++/src/Vector.cc @@ -34,6 +34,7 @@ namespace orc { notNull(pool, cap), hasNulls(false), isEncoded(false), + dictionaryDecoded(false), memoryPool(pool) { std::memset(notNull.data(), 1, capacity); } @@ -61,6 +62,13 @@ namespace orc { return false; } + void ColumnVectorBatch::decodeDictionary() { + if (dictionaryDecoded) return; + + decodeDictionaryImpl(); + dictionaryDecoded = true; + } + StringDictionary::StringDictionary(MemoryPool& pool) : dictionaryBlob(pool), dictionaryOffset(pool) { // PASS @@ -88,6 +96,17 @@ namespace orc { } } + void EncodedStringVectorBatch::decodeDictionaryImpl() { + size_t n = index.size(); + resize(n); + + for (size_t i = 0; i < n; ++i) { + if (!hasNulls || notNull[i]) { + dictionary->getValueByIndex(index[i], data[i], length[i]); + } + } + } + StringVectorBatch::StringVectorBatch(uint64_t capacity, MemoryPool& pool) : ColumnVectorBatch(capacity, pool), data(pool, capacity), @@ -174,6 +193,12 @@ namespace orc { return false; } + void StructVectorBatch::decodeDictionaryImpl() { + for (const auto& field : fields) { + field->decodeDictionary(); + } + } + ListVectorBatch::ListVectorBatch(uint64_t cap, MemoryPool& pool) : ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) { offsets.zeroOut(); @@ -211,6 +236,10 @@ namespace orc { return true; } + void ListVectorBatch::decodeDictionaryImpl() { + elements->decodeDictionary(); + } + MapVectorBatch::MapVectorBatch(uint64_t cap, MemoryPool& pool) : ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) { offsets.zeroOut(); @@ -251,6 +280,16 @@ namespace orc { return true; } + void MapVectorBatch::decodeDictionaryImpl() { + if (keys) { + keys->decodeDictionary(); + } + + if (elements) { + elements->decodeDictionary(); + } + } + UnionVectorBatch::UnionVectorBatch(uint64_t cap, MemoryPool& pool) : ColumnVectorBatch(cap, pool), tags(pool, cap), offsets(pool, cap) { tags.zeroOut(); @@ -310,6 +349,12 @@ namespace orc { return false; } + void UnionVectorBatch::decodeDictionaryImpl() { + for (const auto& child : children) { + child->decodeDictionary(); + } + } + Decimal64VectorBatch::Decimal64VectorBatch(uint64_t cap, MemoryPool& pool) : ColumnVectorBatch(cap, pool), precision(0), diff --git a/c++/src/Writer.cc b/c++/src/Writer.cc index fceac7c2fb..c235169cca 100644 --- a/c++/src/Writer.cc +++ b/c++/src/Writer.cc @@ -24,6 +24,7 @@ #include "Utils.hh" #include +#include namespace orc { @@ -46,6 +47,8 @@ namespace orc { WriterMetrics* metrics; bool useTightNumericVector; uint64_t outputBufferCapacity; + uint64_t memoryBlockSize; + bool alignBlockBoundToRowGroup; WriterOptionsPrivate() : fileVersion(FileVersion::v_0_12()) { // default to Hive_0_12 stripeSize = 64 * 1024 * 1024; // 64M @@ -67,6 +70,8 @@ namespace orc { metrics = nullptr; useTightNumericVector = false; outputBufferCapacity = 1024 * 1024; + memoryBlockSize = 64 * 
1024; // 64K + alignBlockBoundToRowGroup = false; } }; @@ -287,6 +292,24 @@ namespace orc { return privateBits_->outputBufferCapacity; } + WriterOptions& WriterOptions::setMemoryBlockSize(uint64_t capacity) { + privateBits_->memoryBlockSize = capacity; + return *this; + } + + uint64_t WriterOptions::getMemoryBlockSize() const { + return privateBits_->memoryBlockSize; + } + + WriterOptions& WriterOptions::setAlignBlockBoundToRowGroup(bool alignBlockBoundToRowGroup) { + privateBits_->alignBlockBoundToRowGroup = alignBlockBoundToRowGroup; + return *this; + } + + bool WriterOptions::getAlignBlockBoundToRowGroup() const { + return privateBits_->alignBlockBoundToRowGroup; + } + Writer::~Writer() { // PASS } @@ -352,11 +375,16 @@ namespace orc { useTightNumericVector_ = opts.getUseTightNumericVector(); + if (options_.getCompressionBlockSize() % options_.getMemoryBlockSize() != 0) { + throw std::invalid_argument( + "Compression block size must be a multiple of memory block size."); + } + // compression stream for stripe footer, file footer and metadata - compressionStream_ = - createCompressor(options_.getCompression(), outStream_, options_.getCompressionStrategy(), - options_.getOutputBufferCapacity(), options_.getCompressionBlockSize(), - *options_.getMemoryPool(), options_.getWriterMetrics()); + compressionStream_ = createCompressor( + options_.getCompression(), outStream_, options_.getCompressionStrategy(), + options_.getOutputBufferCapacity(), options_.getCompressionBlockSize(), + options_.getMemoryBlockSize(), *options_.getMemoryPool(), options_.getWriterMetrics()); // uncompressed stream for post script bufferedStream_.reset(new BufferedOutputStream(*options_.getMemoryPool(), outStream_, @@ -385,6 +413,9 @@ namespace orc { stripeRows_ += chunkSize; if (indexRows_ >= rowIndexStride) { + if (options_.getAlignBlockBoundToRowGroup()) { + columnWriter_->finishStreams(); + } columnWriter_->createRowIndexEntry(); indexRows_ = 0; } @@ -437,7 +468,7 @@ namespace orc { // Write file header const static size_t magicIdLength = strlen(WriterImpl::magicId); { - SCOPED_STOPWATCH(options.getWriterMetrics(), IOBlockingLatencyUs, IOCount); + SCOPED_STOPWATCH(options_.getWriterMetrics(), IOBlockingLatencyUs, IOCount); outStream_->write(WriterImpl::magicId, magicIdLength); } currentOffset_ += magicIdLength; @@ -585,7 +616,7 @@ namespace orc { throw std::logic_error("Failed to write post script."); } unsigned char psLength = static_cast(bufferedStream_->flush()); - SCOPED_STOPWATCH(options.getWriterMetrics(), IOBlockingLatencyUs, IOCount); + SCOPED_STOPWATCH(options_.getWriterMetrics(), IOBlockingLatencyUs, IOCount); outStream_->write(&psLength, sizeof(unsigned char)); } @@ -672,6 +703,40 @@ namespace orc { protoType.set_kind(proto::Type_Kind_CHAR); break; } + case GEOMETRY: { + protoType.set_kind(proto::Type_Kind_GEOMETRY); + protoType.set_crs(t.getCrs()); + break; + } + case GEOGRAPHY: { + protoType.set_kind(proto::Type_Kind_GEOGRAPHY); + protoType.set_crs(t.getCrs()); + switch (t.getAlgorithm()) { + case geospatial::EdgeInterpolationAlgorithm::SPHERICAL: { + protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_SPHERICAL); + break; + } + case orc::geospatial::EdgeInterpolationAlgorithm::VINCENTY: { + protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_VINCENTY); + break; + } + case orc::geospatial::EdgeInterpolationAlgorithm::THOMAS: { + protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_VINCENTY); + break; + } + case orc::geospatial::EdgeInterpolationAlgorithm::ANDOYER: { + 
protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_ANDOYER); + break; + } + case orc::geospatial::EdgeInterpolationAlgorithm::KARNEY: { + protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_KARNEY); + break; + } + default: + throw std::invalid_argument("Unknown Algorithm."); + } + break; + } default: throw std::logic_error("Unknown type."); } diff --git a/c++/src/io/Cache.cc b/c++/src/io/Cache.cc new file mode 100644 index 0000000000..39f63fdd2b --- /dev/null +++ b/c++/src/io/Cache.cc @@ -0,0 +1,171 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "Cache.hh" + +namespace orc { + + std::vector ReadRangeCombiner::coalesce(std::vector ranges) const { + if (ranges.empty()) { + return ranges; + } + + // Remove zero-sized ranges + auto end = std::remove_if(ranges.begin(), ranges.end(), + [](const ReadRange& range) { return range.length == 0; }); + // Sort in position order + std::sort(ranges.begin(), end, [](const ReadRange& a, const ReadRange& b) { + return a.offset != b.offset ? a.offset < b.offset : a.length > b.length; + }); + + // Remove ranges that overlap 100% + std::vector uniqueRanges; + uniqueRanges.reserve(ranges.size()); + for (auto it = ranges.begin(); it != end; ++it) { + if (uniqueRanges.empty() || !uniqueRanges.back().contains(*it)) { + uniqueRanges.push_back(*it); + } + } + ranges = std::move(uniqueRanges); + + // Skip further processing if ranges is empty after removing zero-sized ranges. + if (ranges.empty()) { + return ranges; + } + +#ifndef NDEBUG + for (size_t i = 0; i < ranges.size() - 1; ++i) { + const auto& left = ranges[i]; + const auto& right = ranges[i + 1]; + assert(left.offset < right.offset); + assert(!left.contains(right)); + } +#endif + + std::vector coalesced; + auto itr = ranges.begin(); + + // Start of the current coalesced range and end (exclusive) of previous range. + // Both are initialized with the start of first range which is a placeholder value. + uint64_t coalescedStart = itr->offset; + uint64_t coalescedEnd = coalescedStart + itr->length; + + for (++itr; itr < ranges.end(); ++itr) { + const uint64_t currentRangeStart = itr->offset; + const uint64_t currentRangeEnd = currentRangeStart + itr->length; + + assert(coalescedStart < coalescedEnd); + assert(currentRangeStart < currentRangeEnd); + + // At this point, the coalesced range is [coalesced_start, prev_range_end). + // Stop coalescing if: + // - coalesced range is too large, or + // - distance (hole/gap) between consecutive ranges is too large. 
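// A small illustration of the two stop conditions above, with hypothetical limits
// (not ORC defaults): holeSizeLimit = 10, rangeSizeLimit = 100.
//   std::vector<ReadRange> in{{0, 20}, {25, 10}, {200, 30}};
//   auto out = ReadRangeCombiner::coalesceReadRanges(std::move(in), 10, 100);
// {0, 20} and {25, 10} coalesce into {0, 35} because the 5-byte hole between them is
// within holeSizeLimit; {200, 30} starts a new coalesced range because merging it
// would exceed rangeSizeLimit (and the 165-byte hole also exceeds holeSizeLimit).
// The result is therefore {0, 35} and {200, 30}.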
+ if ((currentRangeEnd - coalescedStart > rangeSizeLimit) || + (currentRangeStart > coalescedEnd + holeSizeLimit)) { + coalesced.push_back({coalescedStart, coalescedEnd - coalescedStart}); + coalescedStart = currentRangeStart; + } + + // Update the prev_range_end with the current range. + coalescedEnd = currentRangeEnd; + } + coalesced.push_back({coalescedStart, coalescedEnd - coalescedStart}); + + assert(coalesced.front().offset == ranges.front().offset); + assert(coalesced.back().offset + coalesced.back().length == + ranges.back().offset + ranges.back().length); + return coalesced; + } + + std::vector ReadRangeCombiner::coalesceReadRanges(std::vector ranges, + uint64_t holeSizeLimit, + uint64_t rangeSizeLimit) { + assert(rangeSizeLimit > holeSizeLimit); + + ReadRangeCombiner combiner{holeSizeLimit, rangeSizeLimit}; + return combiner.coalesce(std::move(ranges)); + } + + void ReadRangeCache::cache(std::vector ranges) { + ranges = ReadRangeCombiner::coalesceReadRanges(std::move(ranges), options_.holeSizeLimit, + options_.rangeSizeLimit); + + std::vector newEntries = makeCacheEntries(ranges); + // Add new entries, themselves ordered by offset + if (entries_.size() > 0) { + std::vector merged(entries_.size() + newEntries.size()); + std::merge(entries_.begin(), entries_.end(), newEntries.begin(), newEntries.end(), + merged.begin()); + entries_ = std::move(merged); + } else { + entries_ = std::move(newEntries); + } + } + + BufferSlice ReadRangeCache::read(const ReadRange& range) { + if (range.length == 0) { + return {std::make_shared(*memoryPool_, 0), 0, 0}; + } + + const auto it = std::lower_bound(entries_.begin(), entries_.end(), range, + [](const RangeCacheEntry& entry, const ReadRange& range) { + return entry.range.offset + entry.range.length < + range.offset + range.length; + }); + + BufferSlice result{}; + bool hit_cache = false; + if (it != entries_.end() && it->range.contains(range)) { + hit_cache = it->future.valid(); + it->future.get(); + result = BufferSlice{it->buffer, range.offset - it->range.offset, range.length}; + } + + if (metrics_) { + if (hit_cache) + metrics_->ReadRangeCacheHits.fetch_add(1); + else + metrics_->ReadRangeCacheMisses.fetch_add(1); + } + return result; + } + + void ReadRangeCache::evictEntriesBefore(uint64_t boundary) { + auto it = std::lower_bound(entries_.begin(), entries_.end(), boundary, + [](const RangeCacheEntry& entry, uint64_t offset) { + return entry.range.offset + entry.range.length <= offset; + }); + entries_.erase(entries_.begin(), it); + } + + std::vector ReadRangeCache::makeCacheEntries( + const std::vector& ranges) const { + std::vector newEntries; + newEntries.reserve(ranges.size()); + for (const auto& range : ranges) { + BufferPtr buffer = std::make_shared(*memoryPool_, range.length); + std::future future = stream_->readAsync(buffer->data(), buffer->size(), range.offset); + newEntries.emplace_back(range, std::move(buffer), std::move(future)); + } + return newEntries; + } + +} // namespace orc diff --git a/c++/src/io/Cache.hh b/c++/src/io/Cache.hh new file mode 100644 index 0000000000..7fc79718aa --- /dev/null +++ b/c++/src/io/Cache.hh @@ -0,0 +1,122 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "orc/MemoryPool.hh" +#include "orc/OrcFile.hh" + +#include +#include +#include +#include +#include +#include + +namespace orc { + + struct ReadRange { + uint64_t offset; + uint64_t length; + + ReadRange() = default; + ReadRange(uint64_t offset, uint64_t length) : offset(offset), length(length) {} + + friend bool operator==(const ReadRange& left, const ReadRange& right) { + return (left.offset == right.offset && left.length == right.length); + } + friend bool operator!=(const ReadRange& left, const ReadRange& right) { + return !(left == right); + } + + bool contains(const ReadRange& other) const { + return (offset <= other.offset && offset + length >= other.offset + other.length); + } + }; + + struct ReadRangeCombiner { + const uint64_t holeSizeLimit; + const uint64_t rangeSizeLimit; + + std::vector coalesce(std::vector ranges) const; + + static std::vector coalesceReadRanges(std::vector ranges, + uint64_t holeSizeLimit, + uint64_t rangeSizeLimit); + }; + + using Buffer = DataBuffer; + using BufferPtr = std::shared_ptr; + + struct RangeCacheEntry { + ReadRange range; + BufferPtr buffer; + std::shared_future future; // use shared_future in case of multiple get calls + + RangeCacheEntry() = default; + RangeCacheEntry(const ReadRange& range, BufferPtr buffer, std::future future) + : range(range), buffer(std::move(buffer)), future(std::move(future).share()) {} + + friend bool operator<(const RangeCacheEntry& left, const RangeCacheEntry& right) { + return left.range.offset < right.range.offset; + } + }; + + struct BufferSlice { + BufferPtr buffer = nullptr; + uint64_t offset = 0; + uint64_t length = 0; + }; + + /// A read cache designed to hide IO latencies when reading. + class ReadRangeCache { + public: + /// Construct a read cache with given options + explicit ReadRangeCache(InputStream* stream, CacheOptions options, MemoryPool* memoryPool, + ReaderMetrics* metrics = nullptr) + : stream_(stream), + options_(std::move(options)), + memoryPool_(memoryPool), + metrics_(metrics) {} + + ~ReadRangeCache() = default; + + /// Cache the given ranges in the background. + /// + /// The caller must ensure that the ranges do not overlap with each other, + /// nor with previously cached ranges. Otherwise, behaviour will be undefined. + void cache(std::vector ranges); + + /// Read a range previously given to Cache(). + BufferSlice read(const ReadRange& range); + + /// Evict cache entries with its range before given boundary. 
+ void evictEntriesBefore(uint64_t boundary); + + private: + std::vector makeCacheEntries(const std::vector& ranges) const; + + InputStream* stream_; + CacheOptions options_; + // Ordered by offset (so as to find a matching region by binary search) + std::vector entries_; + MemoryPool* memoryPool_; + ReaderMetrics* metrics_; + }; + +} // namespace orc diff --git a/c++/src/io/OutputStream.cc b/c++/src/io/OutputStream.cc index 6fc68e262f..fbf1ca61dd 100644 --- a/c++/src/io/OutputStream.cc +++ b/c++/src/io/OutputStream.cc @@ -61,6 +61,10 @@ namespace orc { } } + void BufferedOutputStream::finishStream() { + // PASS + } + google::protobuf::int64 BufferedOutputStream::ByteCount() const { return static_cast(dataBuffer_->size()); } @@ -87,7 +91,7 @@ namespace orc { uint64_t dataSize = dataBuffer_->size(); // flush data buffer into outputStream if (dataSize > 0) { - SCOPED_STOPWATCH(metrics, IOBlockingLatencyUs, IOCount); + SCOPED_STOPWATCH(metrics_, IOBlockingLatencyUs, IOCount); dataBuffer_->writeTo(outputStream_, metrics_); } dataBuffer_->resize(0); @@ -98,6 +102,10 @@ namespace orc { dataBuffer_->resize(0); } + uint64_t BufferedOutputStream::getRawInputBufferSize() const { + throw std::logic_error("getRawInputBufferSize is not supported."); + } + void AppendOnlyBufferedStream::write(const char* data, size_t size) { size_t dataOffset = 0; while (size > 0) { @@ -120,25 +128,31 @@ namespace orc { } uint64_t AppendOnlyBufferedStream::flush() { - outStream_->BackUp(bufferLength_ - bufferOffset_); - bufferOffset_ = bufferLength_ = 0; - buffer_ = nullptr; + finishStream(); return outStream_->flush(); } void AppendOnlyBufferedStream::recordPosition(PositionRecorder* recorder) const { uint64_t flushedSize = outStream_->getSize(); - uint64_t unflushedSize = static_cast(bufferOffset_); + uint64_t unusedBufferSize = static_cast(bufferLength_ - bufferOffset_); if (outStream_->isCompressed()) { // start of the compression chunk in the stream recorder->add(flushedSize); - // number of decompressed bytes that need to be consumed - recorder->add(unflushedSize); + // There are multiple blocks in the input buffer, but bufferPosition only records the + // effective length of the last block. We need rawInputBufferSize to record the total length + // of all variable blocks. 
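// A worked example with assumed numbers (illustrative only, not taken from the
// writer): say getSize() reports that F compressed bytes have already been flushed,
// getRawInputBufferSize() reports 100 KiB of raw input buffered for the current
// compression chunk, and the block handed out by Next() still has 10 KiB unused
// (bufferLength_ - bufferOffset_). The position recorded below is then the pair
// (F, 100 KiB - 10 KiB = 90 KiB): seek to compressed offset F, then consume 90 KiB
// of decompressed bytes to reach the current logical position.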
+ recorder->add(outStream_->getRawInputBufferSize() - unusedBufferSize); } else { - flushedSize -= static_cast(bufferLength_); // byte offset of the start location - recorder->add(flushedSize + unflushedSize); + recorder->add(flushedSize - unusedBufferSize); } } + void AppendOnlyBufferedStream::finishStream() { + outStream_->BackUp(bufferLength_ - bufferOffset_); + outStream_->finishStream(); + bufferOffset_ = bufferLength_ = 0; + buffer_ = nullptr; + } + } // namespace orc diff --git a/c++/src/io/OutputStream.hh b/c++/src/io/OutputStream.hh index c63bc805bb..6319de96d6 100644 --- a/c++/src/io/OutputStream.hh +++ b/c++/src/io/OutputStream.hh @@ -69,10 +69,12 @@ namespace orc { virtual uint64_t getSize() const; virtual uint64_t flush(); virtual void suppress(); + virtual uint64_t getRawInputBufferSize() const; virtual bool isCompressed() const { return false; } + virtual void finishStream(); }; DIAGNOSTIC_POP @@ -98,6 +100,7 @@ namespace orc { void write(const char* data, size_t size); uint64_t getSize() const; uint64_t flush(); + void finishStream(); void recordPosition(PositionRecorder* recorder) const; }; diff --git a/c++/src/meson.build b/c++/src/meson.build new file mode 100644 index 0000000000..44a98500f3 --- /dev/null +++ b/c++/src/meson.build @@ -0,0 +1,201 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +compiler = meson.get_compiler('cpp') +has_pread = compiler.compiles(''' + #include + #include + int main(int,char*[]){ + int f = open("/x/y", O_RDONLY); + char buf[100]; + return pread(f, buf, 100, 1000) == 0; + } +''') + +has_strptime = compiler.compiles(''' + #include + int main(int,char*[]){ + struct tm time2020; + return !strptime("2020-02-02 12:34:56", "%Y-%m-%d %H:%M:%S", &time2020); + } +''') + +has_builtin_overflow_check = compiler.compiles(''' + int main(){ + int a; + return __builtin_add_overflow(1, 2, &a); + } +''') + +has_diagnostic_push = compiler.compiles(''' + #ifdef __clang__ + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wdeprecated" + #pragma clang diagnostic pop + #elif defined(__GNUC__) + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wdeprecated" + #pragma GCC diagnostic pop + #elif defined(_MSC_VER) + #pragma warning( push ) + #pragma warning( disable : 4996 ) + #pragma warning( pop ) + #else + unknownCompiler! 
+ #endif + int main(int, char *[]) {} +''') + +has_std_isnan = compiler.compiles(''' + #include + int main(int, char *[]) { + return std::isnan(1.0f); + } +''') + +has_double_to_string = compiler.compiles(''' + #include + int main(int, char *[]) { + double d = 5; + std::to_string(d); + } +''') + +has_int64_to_string = compiler.compiles(''' + #include + #include + int main(int, char *[]) { + int64_t d = 5; + std::to_string(d); + } +''') + +has_pre_1970 = compiler.run(''' + #include + int main(int, char *[]) { + time_t t = -14210715; // 1969-07-20 12:34:45 + struct tm *ptm = gmtime(&t); + return !(ptm && ptm->tm_year == 69); + } +''') + +has_post_2038 = compiler.run(''' + #include + #include + int main(int, char *[]) { + setenv("TZ", "America/Los_Angeles", 1); + tzset(); + struct tm time2037; + struct tm time2038; + strptime("2037-05-05 12:34:56", "%Y-%m-%d %H:%M:%S", &time2037); + strptime("2038-05-05 12:34:56", "%Y-%m-%d %H:%M:%S", &time2038); + return (mktime(&time2038) - mktime(&time2037)) <= 31500000; + } +''') + +cdata = configuration_data() +cdata.set10('HAS_PREAD', has_pread) +cdata.set10('HAS_STRPTIME', has_strptime) +cdata.set10('HAS_DIAGNOSTIC_PUSH', has_diagnostic_push) +cdata.set10('HAS_DOUBLE_TO_STRING', has_double_to_string) +cdata.set10('HAS_INT64_TO_STRING', has_int64_to_string) +cdata.set('HAS_PRE_1970', has_pre_1970.returncode() == 0) +cdata.set('HAS_POST_2038', has_post_2038.returncode() == 0) +cdata.set10('HAS_STD_ISNAN', has_std_isnan) +cdata.set10('HAS_BUILTIN_OVERFLOW_CHECK', has_builtin_overflow_check) +cdata.set10('NEEDS_Z_PREFIX', false) # Meson zlib subproject does not need this + +adaptor_header = configure_file( + input: 'Adaptor.hh.in', + output: 'Adaptor.hh', + configuration: cdata, + format: 'cmake', +) + +source_files = [adaptor_header] +source_files += files( + 'io/InputStream.cc', + 'io/OutputStream.cc', + 'io/Cache.cc', + 'sargs/ExpressionTree.cc', + 'sargs/Literal.cc', + 'sargs/PredicateLeaf.cc', + 'sargs/SargsApplier.cc', + 'sargs/SearchArgument.cc', + 'sargs/TruthValue.cc', + 'wrap/orc-proto-wrapper.cc', + 'Adaptor.cc', + 'BlockBuffer.cc', + 'BloomFilter.cc', + 'BpackingDefault.cc', + 'ByteRLE.cc', + 'ColumnPrinter.cc', + 'ColumnReader.cc', + 'ColumnWriter.cc', + 'Common.cc', + 'Compression.cc', + 'ConvertColumnReader.cc', + 'CpuInfoUtil.cc', + 'Exceptions.cc', + 'Geospatial.cc', + 'Int128.cc', + 'LzoDecompressor.cc', + 'MemoryPool.cc', + 'Murmur3.cc', + 'OrcFile.cc', + 'Reader.cc', + 'RLEv1.cc', + 'RLEV2Util.cc', + 'RleDecoderV2.cc', + 'RleEncoderV2.cc', + 'RLE.cc', + 'SchemaEvolution.cc', + 'Statistics.cc', + 'StripeStream.cc', + 'Timezone.cc', + 'TypeImpl.cc', + 'Vector.cc', + 'Writer.cc', +) + +incdir = include_directories('../include') +orc_format_proto_dep = dependency('orc_format_proto') +# zstd requires us to add the threads +threads_dep = dependency('threads') + +orc_lib = library( + 'orc', + sources: source_files, + dependencies: [ + orc_format_proto_dep, + protobuf_dep, + zlib_dep, + snappy_dep, + lz4_dep, + zstd_dep, + threads_dep, + sparsehash_c11_dep, + ], + include_directories: incdir, + install: true, +) + +orc_dep = declare_dependency( + link_with: orc_lib, + include_directories: incdir, + dependencies: orc_format_proto_dep, +) diff --git a/c++/src/sargs/ExpressionTree.cc b/c++/src/sargs/ExpressionTree.cc index e49bca4b77..58dd13817d 100644 --- a/c++/src/sargs/ExpressionTree.cc +++ b/c++/src/sargs/ExpressionTree.cc @@ -110,6 +110,9 @@ namespace orc { return result; } case Operator::NOT: + if (children_.size() != 1) { + throw 
std::invalid_argument("NOT operator must have exactly one child"); + } return !children_.at(0)->evaluate(leaves); case Operator::LEAF: return leaves[leaf_]; @@ -159,6 +162,9 @@ namespace orc { sstream << ')'; break; case Operator::NOT: + if (children_.size() != 1) { + throw std::invalid_argument("NOT operator must have exactly one child"); + } sstream << "(not " << children_.at(0)->toString() << ')'; break; case Operator::LEAF: diff --git a/c++/src/sargs/PredicateLeaf.cc b/c++/src/sargs/PredicateLeaf.cc index d9df1c5d5c..5c77616836 100644 --- a/c++/src/sargs/PredicateLeaf.cc +++ b/c++/src/sargs/PredicateLeaf.cc @@ -701,6 +701,9 @@ namespace orc { } } + // files written by trino may lack of hasnull field. + if (!colStats.has_has_null()) return TruthValue::YES_NO_NULL; + bool allNull = colStats.has_null() && colStats.number_of_values() == 0; if (operator_ == Operator::IS_NULL || ((operator_ == Operator::EQUALS || operator_ == Operator::NULL_SAFE_EQUALS) && diff --git a/c++/src/sargs/SearchArgument.cc b/c++/src/sargs/SearchArgument.cc index 83d4af2435..ff0ba1e2d5 100644 --- a/c++/src/sargs/SearchArgument.cc +++ b/c++/src/sargs/SearchArgument.cc @@ -272,6 +272,12 @@ namespace orc { return *this; } + SearchArgumentBuilder& SearchArgumentBuilderImpl::maybe() { + TreeNode& parent = currTree_.front(); + parent->addChild(std::make_shared(TruthValue::YES_NO_NULL)); + return *this; + } + /** * Recursively explore the tree to find the leaves that are still reachable * after optimizations. diff --git a/c++/src/sargs/SearchArgument.hh b/c++/src/sargs/SearchArgument.hh index 1963c993d6..7d663f7349 100644 --- a/c++/src/sargs/SearchArgument.hh +++ b/c++/src/sargs/SearchArgument.hh @@ -275,6 +275,12 @@ namespace orc { */ std::unique_ptr build() override; + /** + * Add a maybe leaf to the current item on the stack. + * @return this + */ + SearchArgumentBuilder& maybe() override; + private: SearchArgumentBuilder& start(ExpressionTree::Operator op); size_t addLeaf(PredicateLeaf leaf); diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt index b04055366c..b0ee48f38a 100644 --- a/c++/test/CMakeLists.txt +++ b/c++/test/CMakeLists.txt @@ -15,14 +15,15 @@ # specific language governing permissions and limitations # under the License. 
-include_directories( - ${PROJECT_SOURCE_DIR}/c++/src +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX17_FLAGS} ${WARN_FLAGS}") + +add_library (orc-test-include INTERFACE) +target_include_directories (orc-test-include INTERFACE ${PROJECT_BINARY_DIR}/c++/include ${PROJECT_BINARY_DIR}/c++/src + ${PROJECT_SOURCE_DIR}/c++/src ) -set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX17_FLAGS} ${WARN_FLAGS}") - if(BUILD_ENABLE_AVX512) set(SIMD_TEST_SRCS TestRleVectorDecoder.cc) endif(BUILD_ENABLE_AVX512) @@ -55,13 +56,16 @@ add_executable (orc-test TestRleEncoder.cc TestRLEV2Util.cc TestSargsApplier.cc + TestStatistics.cc TestSearchArgument.cc TestSchemaEvolution.cc TestStripeIndexStatistics.cc TestTimestampStatistics.cc TestTimezone.cc TestType.cc + TestUtil.cc TestWriter.cc + TestCache.cc ${SIMD_TEST_SRCS} ) @@ -73,6 +77,8 @@ target_link_libraries (orc-test orc::zlib orc::gtest orc::gmock + orc::sparsehash + orc-test-include ) add_executable (create-test-files @@ -82,6 +88,7 @@ add_executable (create-test-files target_link_libraries (create-test-files orc orc::protobuf + orc-test-include ) if (TEST_VALGRIND_MEMCHECK) diff --git a/c++/test/MemoryInputStream.hh b/c++/test/MemoryInputStream.hh index e6ef55b6de..31333ae430 100644 --- a/c++/test/MemoryInputStream.hh +++ b/c++/test/MemoryInputStream.hh @@ -22,8 +22,6 @@ #include "io/InputStream.hh" #include "orc/OrcFile.hh" -#include - namespace orc { class MemoryInputStream : public InputStream { public: @@ -44,6 +42,11 @@ namespace orc { memcpy(buf, buffer_ + offset, length); } + std::future readAsync(void* buf, uint64_t length, uint64_t offset) override { + return std::async(std::launch::async, + [this, buf, length, offset] { this->read(buf, length, offset); }); + } + virtual const std::string& getName() const override { return name_; } diff --git a/c++/test/TestByteRle.cc b/c++/test/TestByteRle.cc index a822a61d6b..7717eab387 100644 --- a/c++/test/TestByteRle.cc +++ b/c++/test/TestByteRle.cc @@ -1263,7 +1263,7 @@ namespace orc { MemoryOutputStream memStream(capacity); std::unique_ptr encoder = createBooleanRleEncoder( createCompressor(CompressionKind_ZSTD, &memStream, CompressionStrategy_COMPRESSION, - capacity, blockSize, *getDefaultPool(), nullptr)); + capacity, blockSize, blockSize, *getDefaultPool(), nullptr)); encoder->add(data, numValues, nullptr); encoder->flush(); diff --git a/c++/test/TestCache.cc b/c++/test/TestCache.cc new file mode 100644 index 0000000000..496ba3ec90 --- /dev/null +++ b/c++/test/TestCache.cc @@ -0,0 +1,142 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "MemoryInputStream.hh" +#include "io/Cache.hh" + +#include "wrap/gmock.h" +#include "wrap/gtest-wrapper.h" + +namespace orc { + + TEST(TestReadRangeCombiner, testBasics) { + ReadRangeCombiner combinator{0, 100}; + /// Ranges with partial overlap and identical offsets + std::vector ranges{{0, 15}, {5, 11}, {5, 15}}; + std::vector result = combinator.coalesce(std::move(ranges)); + std::vector expect{{0, 20}}; + ASSERT_EQ(result, expect); + } + + TEST(TestCoalesceReadRanges, testBasics) { + auto check = [](std::vector ranges, std::vector expected) -> void { + const uint64_t holeSizeLimit = 9; + const uint64_t rangeSizeLimit = 99; + auto coalesced = ReadRangeCombiner::coalesceReadRanges(ranges, holeSizeLimit, rangeSizeLimit); + ASSERT_EQ(coalesced, expected); + }; + + check({}, {}); + // Zero sized range that ends up in empty list + check({{110, 0}}, {}); + // Combination on 1 zero sized range and 1 non-zero sized range + check({{110, 10}, {120, 0}}, {{110, 10}}); + // 1 non-zero sized range + check({{110, 10}}, {{110, 10}}); + // No holes + unordered ranges + check({{130, 10}, {110, 10}, {120, 10}}, {{110, 30}}); + // No holes + check({{110, 10}, {120, 10}, {130, 10}}, {{110, 30}}); + // Small holes only + check({{110, 11}, {130, 11}, {150, 11}}, {{110, 51}}); + // Large holes + check({{110, 10}, {130, 10}}, {{110, 10}, {130, 10}}); + check({{110, 11}, {130, 11}, {150, 10}, {170, 11}, {190, 11}}, {{110, 50}, {170, 31}}); + + // With zero-sized ranges + check({{110, 11}, {130, 0}, {130, 11}, {145, 0}, {150, 11}, {200, 0}}, {{110, 51}}); + + // No holes but large ranges + check({{110, 100}, {210, 100}}, {{110, 100}, {210, 100}}); + // Small holes and large range in the middle (*) + check({{110, 10}, {120, 11}, {140, 100}, {240, 11}, {260, 11}}, + {{110, 21}, {140, 100}, {240, 31}}); + // Mid-size ranges that would turn large after coalescing + check({{100, 50}, {150, 50}}, {{100, 50}, {150, 50}}); + check({{100, 30}, {130, 30}, {160, 30}, {190, 30}, {220, 30}}, {{100, 90}, {190, 60}}); + + // Same as (*) but unsorted + check({{140, 100}, {120, 11}, {240, 11}, {110, 10}, {260, 11}}, + {{110, 21}, {140, 100}, {240, 31}}); + + // Completely overlapping ranges should be eliminated + check({{20, 5}, {20, 5}, {21, 2}}, {{20, 5}}); + } + + TEST(TestReadRangeCache, testBasics) { + std::string data = "abcdefghijklmnopqrstuvwxyz"; + + CacheOptions options; + options.holeSizeLimit = 2; + options.rangeSizeLimit = 10; + + auto file = std::make_shared(data.data(), data.size()); + ReadRangeCache cache(file.get(), options, getDefaultPool()); + + cache.cache({{1, 2}, {3, 2}, {8, 2}, {20, 2}, {25, 0}}); + cache.cache({{10, 4}, {14, 0}, {15, 4}}); + + auto assert_slice_equal = [](const BufferSlice& slice, const std::string& expected) { + ASSERT_TRUE(slice.buffer); + ASSERT_EQ(expected, std::string_view(slice.buffer->data() + slice.offset, slice.length)); + }; + + BufferSlice slice; + + slice = cache.read({20, 2}); + assert_slice_equal(slice, "uv"); + + slice = cache.read({1, 2}); + assert_slice_equal(slice, "bc"); + + slice = cache.read({3, 2}); + assert_slice_equal(slice, "de"); + + slice = cache.read({8, 2}); + assert_slice_equal(slice, "ij"); + + slice = cache.read({10, 4}); + assert_slice_equal(slice, "klmn"); + + slice = cache.read({15, 4}); + assert_slice_equal(slice, "pqrs"); + + // Zero-sized + slice = cache.read({14, 0}); + assert_slice_equal(slice, ""); + slice = cache.read({25, 0}); + assert_slice_equal(slice, ""); + + // Non-cached ranges + ASSERT_FALSE(cache.read({20, 
3}).buffer); + ASSERT_FALSE(cache.read({19, 3}).buffer); + ASSERT_FALSE(cache.read({0, 3}).buffer); + ASSERT_FALSE(cache.read({25, 2}).buffer); + + // Release cache entries before 10. After that cache entries would be: {10, 9}, {20, 2} + cache.evictEntriesBefore(15); + ASSERT_FALSE(cache.read({1, 2}).buffer); + ASSERT_FALSE(cache.read({8, 2}).buffer); + slice = cache.read({10, 4}); + assert_slice_equal(slice, "klmn"); + slice = cache.read({20, 2}); + assert_slice_equal(slice, "uv"); + } +} // namespace orc diff --git a/c++/test/TestColumnStatistics.cc b/c++/test/TestColumnStatistics.cc index 5cf2d9e41b..642a8019de 100644 --- a/c++/test/TestColumnStatistics.cc +++ b/c++/test/TestColumnStatistics.cc @@ -17,6 +17,7 @@ */ #include "Statistics.hh" +#include "TestUtil.hh" #include "orc/OrcFile.hh" #include "wrap/gmock.h" #include "wrap/gtest-wrapper.h" @@ -531,4 +532,345 @@ namespace orc { collectionStats->merge(*other); EXPECT_FALSE(collectionStats->hasTotalChildren()); } + + TEST(ColumnStatistics, TestGeospatialDefaults) { + std::unique_ptr geoStats(new GeospatialColumnStatisticsImpl()); + EXPECT_TRUE(geoStats->getGeospatialTypes().empty()); + auto bbox = geoStats->getBoundingBox(); + for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) { + EXPECT_TRUE(bbox.boundEmpty(i)); + EXPECT_TRUE(bbox.boundValid(i)); + } + EXPECT_EQ(" x: empty y: empty z: empty m: empty geometry_types: []", + geoStats->toString()); + } + + TEST(ColumnStatistics, TestGeospatialUpdate) { + std::unique_ptr geoStats(new GeospatialColumnStatisticsImpl()); + EXPECT_TRUE(geoStats->getGeospatialTypes().empty()); + const auto& bbox = geoStats->getBoundingBox(); + for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) { + EXPECT_TRUE(bbox.boundEmpty(i)); + EXPECT_TRUE(bbox.boundValid(i)); + } + EXPECT_EQ(geoStats->getGeospatialTypes().size(), 0); + + geospatial::BoundingBox::XYZM expectedMin; + geospatial::BoundingBox::XYZM expectedMax; + std::array expectedEmpty; + std::array expectedValid; + std::vector expectedTypes; + for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) { + expectedMin[i] = geospatial::INF; + expectedMax[i] = -geospatial::INF; + expectedEmpty[i] = true; + expectedValid[i] = true; + } + + auto Verify = [&]() { + EXPECT_EQ(expectedEmpty, geoStats->getBoundingBox().dimensionEmpty()); + EXPECT_EQ(expectedValid, geoStats->getBoundingBox().dimensionValid()); + EXPECT_EQ(expectedTypes, geoStats->getGeospatialTypes()); + for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) { + if (geoStats->getBoundingBox().boundValid(i)) { + EXPECT_EQ(expectedMin[i], geoStats->getBoundingBox().lowerBound()[i]); + EXPECT_EQ(expectedMax[i], geoStats->getBoundingBox().upperBound()[i]); + } else { + EXPECT_TRUE(std::isnan(geoStats->getBoundingBox().lowerBound()[i])); + EXPECT_TRUE(std::isnan(geoStats->getBoundingBox().upperBound()[i])); + } + } + }; + + // Update a xy point + std::string xy0 = MakeWKBPoint({10, 11}, false, false); + geoStats->update(xy0.c_str(), xy0.size()); + expectedMin[0] = expectedMax[0] = 10; + expectedMin[1] = expectedMax[1] = 11; + expectedEmpty[0] = expectedEmpty[1] = false; + expectedTypes.push_back(1); + Verify(); + + // Update a xyz point. + std::string xyz0 = MakeWKBPoint({11, 12, 13}, true, false); + geoStats->update(xyz0.c_str(), xyz0.size()); + expectedMax[0] = 11; + expectedMax[1] = 12; + expectedMin[2] = expectedMax[2] = 13; + expectedEmpty[2] = false; + expectedTypes.push_back(1001); + Verify(); + + // Update a xym point. 
+ std::string xym0 = MakeWKBPoint({9, 10, 0, 11}, false, true); + geoStats->update(xym0.c_str(), xym0.size()); + expectedMin[0] = 9; + expectedMin[1] = 10; + expectedMin[3] = expectedMax[3] = 11; + expectedEmpty[3] = false; + expectedTypes.push_back(2001); + Verify(); + + // Update a xymz point. + std::string xymz0 = MakeWKBPoint({8, 9, 10, 12}, true, true); + geoStats->update(xymz0.c_str(), xymz0.size()); + expectedMin[0] = 8; + expectedMin[1] = 9; + expectedMin[2] = 10; + expectedMax[3] = 12; + expectedTypes.push_back(3001); + Verify(); + + // Update NaN to every dimension. + std::string xyzm1 = MakeWKBPoint( + {std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN()}, + true, false); + geoStats->update(xyzm1.c_str(), xyzm1.size()); + Verify(); + + // Update a invalid WKB + std::string invalidWKB; + geoStats->update(invalidWKB.c_str(), invalidWKB.size()); + expectedValid[0] = expectedValid[1] = expectedValid[2] = expectedValid[3] = false; + expectedTypes.clear(); + Verify(); + + // Update a xy point again + std::string xy1 = MakeWKBPoint({10, 11}, false, false); + geoStats->update(xy1.c_str(), xy1.size()); + Verify(); + } + + TEST(ColumnStatistics, TestGeospatialToProto) { + // Test Empty + std::unique_ptr geoStats(new GeospatialColumnStatisticsImpl()); + proto::ColumnStatistics pbStats; + geoStats->toProtoBuf(pbStats); + EXPECT_TRUE(pbStats.has_geospatial_statistics()); + EXPECT_EQ(0, pbStats.geospatial_statistics().geospatial_types().size()); + EXPECT_FALSE(pbStats.geospatial_statistics().has_bbox()); + + // Update a xy point + std::string xy = MakeWKBPoint({10, 11}, false, false); + geoStats->update(xy.c_str(), xy.size()); + pbStats.Clear(); + geoStats->toProtoBuf(pbStats); + EXPECT_TRUE(pbStats.has_geospatial_statistics()); + EXPECT_EQ(1, pbStats.geospatial_statistics().geospatial_types().size()); + EXPECT_EQ(1, pbStats.geospatial_statistics().geospatial_types(0)); + EXPECT_TRUE(pbStats.geospatial_statistics().has_bbox()); + const auto& bbox0 = pbStats.geospatial_statistics().bbox(); + EXPECT_TRUE(bbox0.has_xmin()); + EXPECT_TRUE(bbox0.has_xmax()); + EXPECT_TRUE(bbox0.has_ymin()); + EXPECT_TRUE(bbox0.has_ymax()); + EXPECT_FALSE(bbox0.has_zmin()); + EXPECT_FALSE(bbox0.has_zmax()); + EXPECT_FALSE(bbox0.has_mmin()); + EXPECT_FALSE(bbox0.has_mmax()); + EXPECT_EQ(10, bbox0.xmin()); + EXPECT_EQ(10, bbox0.xmax()); + EXPECT_EQ(11, bbox0.ymin()); + EXPECT_EQ(11, bbox0.ymax()); + + // Update a xyzm point. 
+ std::string xyzm = MakeWKBPoint({-10, -11, -12, -13}, true, true); + geoStats->update(xyzm.c_str(), xyzm.size()); + pbStats.Clear(); + geoStats->toProtoBuf(pbStats); + EXPECT_TRUE(pbStats.has_geospatial_statistics()); + EXPECT_EQ(2, pbStats.geospatial_statistics().geospatial_types().size()); + EXPECT_EQ(1, pbStats.geospatial_statistics().geospatial_types(0)); + EXPECT_EQ(3001, pbStats.geospatial_statistics().geospatial_types(1)); + EXPECT_TRUE(pbStats.geospatial_statistics().has_bbox()); + const auto& bbox1 = pbStats.geospatial_statistics().bbox(); + EXPECT_TRUE(bbox1.has_xmin()); + EXPECT_TRUE(bbox1.has_xmax()); + EXPECT_TRUE(bbox1.has_ymin()); + EXPECT_TRUE(bbox1.has_ymax()); + EXPECT_TRUE(bbox1.has_zmin()); + EXPECT_TRUE(bbox1.has_zmax()); + EXPECT_TRUE(bbox1.has_mmin()); + EXPECT_TRUE(bbox1.has_mmax()); + EXPECT_EQ(-10, bbox1.xmin()); + EXPECT_EQ(10, bbox1.xmax()); + EXPECT_EQ(-11, bbox1.ymin()); + EXPECT_EQ(11, bbox1.ymax()); + EXPECT_EQ(-12, bbox1.zmin()); + EXPECT_EQ(-12, bbox1.zmax()); + EXPECT_EQ(-13, bbox1.mmin()); + EXPECT_EQ(-13, bbox1.mmax()); + + // Update a invalid point + std::string invalidWKB; + geoStats->update(invalidWKB.c_str(), invalidWKB.size()); + pbStats.Clear(); + geoStats->toProtoBuf(pbStats); + EXPECT_TRUE(pbStats.has_geospatial_statistics()); + EXPECT_EQ(0, pbStats.geospatial_statistics().geospatial_types().size()); + EXPECT_FALSE(pbStats.geospatial_statistics().has_bbox()); + } + + TEST(ColumnStatistics, TestGeospatialMerge) { + std::unique_ptr invalidStats( + new GeospatialColumnStatisticsImpl()); + invalidStats->update("0", 0); + + std::unique_ptr emptyStats( + new GeospatialColumnStatisticsImpl()); + + std::unique_ptr xyStats(new GeospatialColumnStatisticsImpl()); + std::string xy = MakeWKBPoint({10, 11}, false, false); + xyStats->update(xy.c_str(), xy.size()); + + std::unique_ptr xyzStats(new GeospatialColumnStatisticsImpl()); + std::string xyz = MakeWKBPoint({12, 13, 14}, true, false); + xyzStats->update(xyz.c_str(), xyz.size()); + + std::unique_ptr xyzmStats(new GeospatialColumnStatisticsImpl()); + std::string xyzm = MakeWKBPoint({-10, -11, -12, -13}, true, true); + xyzmStats->update(xyzm.c_str(), xyzm.size()); + + // invalid merge invalid + invalidStats->merge(*invalidStats); + std::array expectedValid = {false, false, false, false}; + EXPECT_EQ(invalidStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(invalidStats->getGeospatialTypes().size(), 0); + + // Empty merge empty + emptyStats->merge(*emptyStats); + expectedValid = {true, true, true, true}; + std::array expectedEmpty = {true, true, true, true}; + EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionEmpty(), expectedEmpty); + EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 0); + + // Empty merge xy + emptyStats->merge(*xyStats); + expectedEmpty = {false, false, true, true}; + EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionEmpty(), expectedEmpty); + EXPECT_EQ(10, emptyStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(10, emptyStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(11, emptyStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(11, emptyStats->getBoundingBox().upperBound()[1]); + EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 1); + EXPECT_EQ(emptyStats->getGeospatialTypes()[0], 1); + + // Empty merge xyz + emptyStats->merge(*xyzStats); + expectedEmpty = {false, false, false, true}; + 
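+ // Aside on the expected geometry type codes used throughout these geospatial tests:
+ // they appear to follow the ISO WKB numbering, where the base type (1 = Point,
+ // 2 = LineString, 3 = Polygon, ...) is offset by +1000 for Z, +2000 for M and
+ // +3000 for ZM, e.g. 1001 = Point Z and 3001 = Point ZM.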
EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionEmpty(), expectedEmpty); + EXPECT_EQ(10, emptyStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(12, emptyStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(11, emptyStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(13, emptyStats->getBoundingBox().upperBound()[1]); + EXPECT_EQ(14, emptyStats->getBoundingBox().lowerBound()[2]); + EXPECT_EQ(14, emptyStats->getBoundingBox().upperBound()[2]); + EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 2); + EXPECT_EQ(emptyStats->getGeospatialTypes()[0], 1); + EXPECT_EQ(emptyStats->getGeospatialTypes()[1], 1001); + + // Empty merge xyzm + emptyStats->merge(*xyzmStats); + expectedEmpty = {false, false, false, false}; + EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionEmpty(), expectedEmpty); + EXPECT_EQ(-10, emptyStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(12, emptyStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(-11, emptyStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(13, emptyStats->getBoundingBox().upperBound()[1]); + EXPECT_EQ(-12, emptyStats->getBoundingBox().lowerBound()[2]); + EXPECT_EQ(14, emptyStats->getBoundingBox().upperBound()[2]); + EXPECT_EQ(-13, emptyStats->getBoundingBox().lowerBound()[3]); + EXPECT_EQ(-13, emptyStats->getBoundingBox().upperBound()[3]); + EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 3); + EXPECT_EQ(emptyStats->getGeospatialTypes()[0], 1); + EXPECT_EQ(emptyStats->getGeospatialTypes()[1], 1001); + EXPECT_EQ(emptyStats->getGeospatialTypes()[2], 3001); + + // Empty merge invalid + emptyStats->merge(*invalidStats); + expectedValid = {false, false, false, false}; + EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 0); + } + + TEST(ColumnStatistics, TestGeospatialFromProto) { + proto::ColumnStatistics pbStats; + // No geostats + + std::unique_ptr emptyStats0( + new GeospatialColumnStatisticsImpl(pbStats)); + std::array expectedValid = {false, false, false, false}; + EXPECT_TRUE(emptyStats0->getGeospatialTypes().empty()); + EXPECT_EQ(emptyStats0->getBoundingBox().dimensionValid(), expectedValid); + + // Add empty geostats + pbStats.mutable_geospatial_statistics(); + std::unique_ptr emptyStats1( + new GeospatialColumnStatisticsImpl(pbStats)); + EXPECT_TRUE(emptyStats1->getGeospatialTypes().empty()); + EXPECT_EQ(emptyStats1->getBoundingBox().dimensionValid(), expectedValid); + + // Set xy bounds + auto* geoProtoStas = pbStats.mutable_geospatial_statistics(); + geoProtoStas->mutable_bbox()->set_xmin(0); + geoProtoStas->mutable_bbox()->set_xmax(1); + geoProtoStas->mutable_bbox()->set_ymin(0); + geoProtoStas->mutable_bbox()->set_ymax(1); + geoProtoStas->mutable_geospatial_types()->Add(2); + std::unique_ptr xyStats( + new GeospatialColumnStatisticsImpl(pbStats)); + expectedValid = {true, true, false, false}; + EXPECT_EQ(xyStats->getGeospatialTypes().size(), 1); + EXPECT_EQ(xyStats->getGeospatialTypes()[0], 2); + EXPECT_EQ(xyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(0, xyStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(1, xyStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(0, xyStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(1, xyStats->getBoundingBox().upperBound()[1]); + + // Set xyz bounds + geoProtoStas->mutable_bbox()->set_zmin(0); + geoProtoStas->mutable_bbox()->set_zmax(1); + 
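+ // Only dimensions whose min/max fields are present in the protobuf bbox are expected
+ // to be reported as valid, which is why adding zmin/zmax above flips the Z entry of
+ // dimensionValid below; 1003 would be the ISO WKB code for Polygon Z.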
geoProtoStas->mutable_geospatial_types()->Add(1003); + std::unique_ptr xyzStats( + new GeospatialColumnStatisticsImpl(pbStats)); + expectedValid = {true, true, true, false}; + EXPECT_EQ(xyzStats->getGeospatialTypes().size(), 2); + EXPECT_EQ(xyzStats->getGeospatialTypes()[0], 2); + EXPECT_EQ(xyzStats->getGeospatialTypes()[1], 1003); + EXPECT_EQ(xyzStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(0, xyzStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(1, xyzStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(0, xyzStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(1, xyzStats->getBoundingBox().upperBound()[1]); + EXPECT_EQ(0, xyzStats->getBoundingBox().lowerBound()[2]); + EXPECT_EQ(1, xyzStats->getBoundingBox().upperBound()[2]); + + // Set xyzm bounds + geoProtoStas->mutable_bbox()->set_mmin(0); + geoProtoStas->mutable_bbox()->set_mmax(1); + geoProtoStas->mutable_geospatial_types()->Add(3003); + std::unique_ptr xyzmStats( + new GeospatialColumnStatisticsImpl(pbStats)); + expectedValid = {true, true, true, true}; + EXPECT_EQ(xyzmStats->getGeospatialTypes().size(), 3); + EXPECT_EQ(xyzmStats->getGeospatialTypes()[0], 2); + EXPECT_EQ(xyzmStats->getGeospatialTypes()[1], 1003); + EXPECT_EQ(xyzmStats->getGeospatialTypes()[2], 3003); + EXPECT_EQ(xyzmStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(0, xyzmStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(1, xyzmStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(0, xyzmStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(1, xyzmStats->getBoundingBox().upperBound()[1]); + EXPECT_EQ(0, xyzmStats->getBoundingBox().lowerBound()[2]); + EXPECT_EQ(1, xyzmStats->getBoundingBox().upperBound()[2]); + EXPECT_EQ(0, xyzmStats->getBoundingBox().lowerBound()[3]); + EXPECT_EQ(1, xyzmStats->getBoundingBox().upperBound()[3]); + } + } // namespace orc diff --git a/c++/test/TestCompression.cc b/c++/test/TestCompression.cc index a77800a3dd..e95a6f0169 100644 --- a/c++/test/TestCompression.cc +++ b/c++/test/TestCompression.cc @@ -42,12 +42,12 @@ namespace orc { } void decompressAndVerify(const MemoryOutputStream& memStream, CompressionKind kind, - const char* data, size_t size, MemoryPool& pool) { + const char* data, size_t size, MemoryPool& pool, uint64_t capacity) { auto inputStream = std::make_unique(memStream.getData(), memStream.getLength()); std::unique_ptr decompressStream = - createDecompressor(kind, std::move(inputStream), 1024, pool, getDefaultReaderMetrics()); + createDecompressor(kind, std::move(inputStream), capacity, pool, getDefaultReaderMetrics()); const char* decompressedBuffer; int decompressedSize; @@ -66,7 +66,7 @@ namespace orc { CompressionStrategy strategy, uint64_t capacity, uint64_t block, MemoryPool& pool, const char* data, size_t dataSize) { std::unique_ptr compressStream = - createCompressor(kind, outStream, strategy, capacity, block, pool, nullptr); + createCompressor(kind, outStream, strategy, capacity, block, block, pool, nullptr); size_t pos = 0; char* compressBuffer; @@ -99,7 +99,7 @@ namespace orc { char testData[] = "hello world!"; compressAndVerify(kind, &memStream, CompressionStrategy_SPEED, capacity, block, *pool, testData, sizeof(testData)); - decompressAndVerify(memStream, kind, testData, sizeof(testData), *pool); + decompressAndVerify(memStream, kind, testData, sizeof(testData), *pool, capacity); } TEST(TestCompression, zlib_compress_original_string) { @@ -117,7 +117,7 @@ namespace orc { char testData[] = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; 
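// Note on the signature change exercised throughout this file: createCompressor now
// takes two block-size arguments (these tests simply pass `block` twice) and the
// decompressor capacity is threaded through explicitly instead of a hard-coded 1024.
// A minimal sketch of the new call shapes, assuming the extra value corresponds to the
// memory block size that WriterOptions::setMemoryBlockSize configures elsewhere in
// this patch:
//
//   uint64_t capacity = 4 * 1024, blockSize = 1024;
//   auto compressor =
//       createCompressor(CompressionKind_ZLIB, &memStream, CompressionStrategy_SPEED,
//                        capacity, blockSize, blockSize, *getDefaultPool(), nullptr);
//   auto decompressor =
//       createDecompressor(CompressionKind_ZLIB, std::move(inputStream), capacity,
//                          *getDefaultPool(), getDefaultReaderMetrics());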
compressAndVerify(kind, &memStream, CompressionStrategy_SPEED, capacity, block, *pool, testData, sizeof(testData)); - decompressAndVerify(memStream, kind, testData, sizeof(testData), *pool); + decompressAndVerify(memStream, kind, testData, sizeof(testData), *pool, capacity); } TEST(TestCompression, compress_simple_repeated_string) { @@ -138,7 +138,7 @@ namespace orc { } compressAndVerify(kind, &memStream, CompressionStrategy_SPEED, capacity, block, *pool, testData, 170); - decompressAndVerify(memStream, kind, testData, 170, *pool); + decompressAndVerify(memStream, kind, testData, 170, *pool, capacity); } TEST(TestCompression, zlib_compress_two_blocks) { @@ -158,7 +158,7 @@ namespace orc { generateRandomData(testData, dataSize, true); compressAndVerify(kind, &memStream, CompressionStrategy_SPEED, capacity, block, *pool, testData, dataSize); - decompressAndVerify(memStream, kind, testData, dataSize, *pool); + decompressAndVerify(memStream, kind, testData, dataSize, *pool, capacity); delete[] testData; } @@ -179,7 +179,7 @@ namespace orc { generateRandomData(testData, dataSize, false); compressAndVerify(kind, &memStream, CompressionStrategy_SPEED, capacity, block, *pool, testData, dataSize); - decompressAndVerify(memStream, kind, testData, dataSize, *pool); + decompressAndVerify(memStream, kind, testData, dataSize, *pool, capacity); delete[] testData; } @@ -205,7 +205,7 @@ namespace orc { } std::unique_ptr compressStream = createCompressor( - kind, &memStream, CompressionStrategy_SPEED, capacity, block, *pool, nullptr); + kind, &memStream, CompressionStrategy_SPEED, capacity, block, block, *pool, nullptr); EXPECT_TRUE(ps.SerializeToZeroCopyStream(compressStream.get())); compressStream->flush(); @@ -213,8 +213,8 @@ namespace orc { auto inputStream = std::make_unique(memStream.getData(), memStream.getLength()); - std::unique_ptr decompressStream = - createDecompressor(kind, std::move(inputStream), 1024, *pool, getDefaultReaderMetrics()); + std::unique_ptr decompressStream = createDecompressor( + kind, std::move(inputStream), capacity, *pool, getDefaultReaderMetrics()); proto::PostScript ps2; ps2.ParseFromZeroCopyStream(decompressStream.get()); @@ -312,7 +312,7 @@ namespace orc { uint64_t batchSize = 1024, blockSize = 256; AppendOnlyBufferedStream outStream(createCompressor( - kind, &memStream, strategy, DEFAULT_MEM_STREAM_SIZE, blockSize, *pool, nullptr)); + kind, &memStream, strategy, DEFAULT_MEM_STREAM_SIZE, blockSize, blockSize, *pool, nullptr)); // write 3 batches of data and record positions between every batch size_t row = 0; @@ -335,7 +335,7 @@ namespace orc { auto inputStream = std::make_unique(memStream.getData(), memStream.getLength()); std::unique_ptr decompressStream = createDecompressor( - kind, std::move(inputStream), blockSize, *pool, getDefaultReaderMetrics()); + kind, std::move(inputStream), DEFAULT_MEM_STREAM_SIZE, *pool, getDefaultReaderMetrics()); // prepare positions to seek to EXPECT_EQ(rowIndexEntry1.positions_size(), rowIndexEntry2.positions_size()); diff --git a/c++/test/TestConvertColumnReader.cc b/c++/test/TestConvertColumnReader.cc index 83798289db..6096fe4573 100644 --- a/c++/test/TestConvertColumnReader.cc +++ b/c++/test/TestConvertColumnReader.cc @@ -27,6 +27,7 @@ #include "ConvertColumnReader.hh" #include "MemoryInputStream.hh" #include "MemoryOutputStream.hh" +#include namespace orc { @@ -650,6 +651,10 @@ namespace orc { auto& readC2 = dynamic_cast(*readStructBatch.fields[1]); auto& readC3 = dynamic_cast(*readStructBatch.fields[2]); auto& readC4 = 
dynamic_cast(*readStructBatch.fields[3]); + EXPECT_TRUE(9 == readC1.precision && 5 == readC1.scale); + EXPECT_TRUE(20 == readC2.precision && 5 == readC2.scale); + EXPECT_TRUE(10 == readC3.precision && 3 == readC3.scale); + EXPECT_TRUE(19 == readC4.precision && 3 == readC4.scale); EXPECT_EQ(TEST_CASES, readBatch->numElements); for (int i = 0; i < TEST_CASES / 2; i++) { size_t idx = static_cast(i); @@ -815,4 +820,411 @@ namespace orc { } } + TEST(ConvertColumnReader, TestConvertStringVariantToNumeric) { + constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024; + constexpr int TEST_CASES = 6; + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + std::unique_ptr fileType( + Type::buildTypeFromString("struct")); + std::shared_ptr readType(Type::buildTypeFromString("struct")); + WriterOptions options; + auto writer = createWriter(*fileType, &memStream, options); + auto batch = writer->createRowBatch(TEST_CASES); + auto structBatch = dynamic_cast(batch.get()); + auto& c1 = dynamic_cast(*structBatch->fields[0]); + auto& c2 = dynamic_cast(*structBatch->fields[1]); + auto& c3 = dynamic_cast(*structBatch->fields[2]); + std::vector raw1{"", "123456", "0", "-1234567890", "999999999999999999999999", + "error"}; + std::vector raw2{"", "123456", "0", "-1234567890", "999999999999999999999999", + "error"}; + std::vector raw3{ + "", "123456", "-0.0", "-123456789.0123", "1000000000000000000000000000000000000000", + "error"}; + + c1.notNull[0] = c2.notNull[0] = c3.notNull[0] = false; + for (int i = 1; i < TEST_CASES; i++) { + c1.data[i] = raw1[i].data(); + c1.length[i] = raw1[i].length(); + c1.notNull[i] = true; + + c2.data[i] = raw2[i].data(); + c2.length[i] = raw2[i].length(); + c2.notNull[i] = true; + + c3.data[i] = raw3[i].data(); + c3.length[i] = raw3[i].length(); + c3.notNull[i] = true; + } + + structBatch->numElements = c1.numElements = c2.numElements = c3.numElements = TEST_CASES; + structBatch->hasNulls = c1.hasNulls = c2.hasNulls = c3.hasNulls = true; + writer->add(*batch); + writer->close(); + auto inStream = std::make_unique(memStream.getData(), memStream.getLength()); + auto pool = getDefaultPool(); + auto reader = createReader(*pool, std::move(inStream)); + RowReaderOptions rowReaderOptions; + rowReaderOptions.setUseTightNumericVector(true); + rowReaderOptions.setReadType(readType); + auto rowReader = reader->createRowReader(rowReaderOptions); + auto readBatch = rowReader->createRowBatch(TEST_CASES); + EXPECT_EQ(true, rowReader->next(*readBatch)); + + auto& readSturctBatch = dynamic_cast(*readBatch); + auto& readC1 = dynamic_cast(*readSturctBatch.fields[0]); + auto& readC2 = dynamic_cast(*readSturctBatch.fields[1]); + auto& readC3 = dynamic_cast(*readSturctBatch.fields[2]); + + EXPECT_FALSE(readC1.notNull[0]); + EXPECT_FALSE(readC2.notNull[0]); + EXPECT_FALSE(readC3.notNull[0]); + + for (int i = 1; i < 4; i++) { + EXPECT_TRUE(readC1.notNull[i]); + EXPECT_TRUE(readC2.notNull[i]); + EXPECT_TRUE(readC3.notNull[i]); + } + + for (int i = 4; i <= 5; i++) { + EXPECT_FALSE(readC1.notNull[i]) << i; + EXPECT_FALSE(readC2.notNull[i]) << i; + EXPECT_FALSE(readC3.notNull[i]) << i; + } + + EXPECT_EQ(readC1.data[1], 1); + EXPECT_EQ(readC2.data[1], 123456); + EXPECT_FLOAT_EQ(readC3.data[1], 123456); + + EXPECT_EQ(readC1.data[2], 0); + EXPECT_EQ(readC2.data[2], 0); + EXPECT_FLOAT_EQ(readC3.data[2], -0.0); + + EXPECT_EQ(readC1.data[3], 1); + EXPECT_EQ(readC2.data[3], -1234567890); + EXPECT_FLOAT_EQ(readC3.data[3], -123456789.0123); + } + + TEST(ConvertColumnReader, TestConvertStringVariant) { + constexpr 
int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024; + constexpr int TEST_CASES = 4; + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + std::unique_ptr fileType( + Type::buildTypeFromString("struct")); + std::shared_ptr readType( + Type::buildTypeFromString("struct")); + WriterOptions options; + auto writer = createWriter(*fileType, &memStream, options); + auto batch = writer->createRowBatch(TEST_CASES); + auto structBatch = dynamic_cast(batch.get()); + auto& c1 = dynamic_cast(*structBatch->fields[0]); + auto& c2 = dynamic_cast(*structBatch->fields[1]); + auto& c3 = dynamic_cast(*structBatch->fields[2]); + + std::vector raw1{"", "12345", "1", "1234"}; + std::vector raw2{"", "12345", "1", "1234"}; + std::vector raw3{"", "12345", "1", "1234"}; + + c1.notNull[0] = c2.notNull[0] = c3.notNull[0] = false; + for (int i = 1; i < TEST_CASES; i++) { + c1.data[i] = raw1[i].data(); + c1.length[i] = raw1[i].length(); + c1.notNull[i] = true; + + c2.data[i] = raw2[i].data(); + c2.length[i] = raw2[i].length(); + c2.notNull[i] = true; + + c3.data[i] = raw3[i].data(); + c3.length[i] = raw3[i].length(); + c3.notNull[i] = true; + } + structBatch->numElements = c1.numElements = c2.numElements = c3.numElements = TEST_CASES; + structBatch->hasNulls = c1.hasNulls = c2.hasNulls = c3.hasNulls = true; + writer->add(*batch); + writer->close(); + auto inStream = std::make_unique(memStream.getData(), memStream.getLength()); + auto pool = getDefaultPool(); + auto reader = createReader(*pool, std::move(inStream)); + RowReaderOptions rowReaderOptions; + rowReaderOptions.setUseTightNumericVector(true); + rowReaderOptions.setReadType(readType); + auto rowReader = reader->createRowReader(rowReaderOptions); + auto readBatch = rowReader->createRowBatch(TEST_CASES); + EXPECT_EQ(true, rowReader->next(*readBatch)); + + auto& readSturctBatch = dynamic_cast(*readBatch); + auto& readC1 = dynamic_cast(*readSturctBatch.fields[0]); + auto& readC2 = dynamic_cast(*readSturctBatch.fields[1]); + auto& readC3 = dynamic_cast(*readSturctBatch.fields[2]); + + EXPECT_FALSE(readC1.notNull[0]); + EXPECT_FALSE(readC2.notNull[0]); + EXPECT_FALSE(readC3.notNull[0]); + + for (int i = 1; i < TEST_CASES; i++) { + EXPECT_TRUE(readC1.notNull[i]); + EXPECT_TRUE(readC2.notNull[i]); + EXPECT_TRUE(readC3.notNull[i]); + } + + EXPECT_EQ(std::string(readC1.data[1], readC1.length[1]), "12345"); + EXPECT_EQ(std::string(readC2.data[1], readC2.length[1]), "1234"); + EXPECT_EQ(std::string(readC3.data[1], readC3.length[1]), "1234"); + + EXPECT_EQ(std::string(readC1.data[2], readC1.length[2]), "1 "); + EXPECT_EQ(std::string(readC2.data[2], readC2.length[2]), "1 "); + EXPECT_EQ(std::string(readC3.data[2], readC3.length[2]), "1"); + + EXPECT_EQ(std::string(readC1.data[3], readC1.length[3]), "1234 "); + EXPECT_EQ(std::string(readC2.data[3], readC2.length[3]), "1234"); + EXPECT_EQ(std::string(readC3.data[3], readC3.length[3]), "1234"); + } + + // Returns year/month/day triple in civil calendar + // Preconditions: z is number of days since 1970-01-01 and is in the range: + // [numeric_limits::min(), numeric_limits::max()-719468]. + template + constexpr std::tuple civil_from_days(Int z) noexcept { + static_assert(std::numeric_limits::digits >= 18, + "This algorithm has not been ported to a 16 bit unsigned integer"); + static_assert(std::numeric_limits::digits >= 20, + "This algorithm has not been ported to a 16 bit signed integer"); + z += 719468; + const Int era = (z >= 0 ? 
z : z - 146096) / 146097; + const unsigned doe = static_cast(z - era * 146097); // [0, 146096] + const unsigned yoe = (doe - doe / 1460 + doe / 36524 - doe / 146096) / 365; // [0, 399] + const Int y = static_cast(yoe) + era * 400; + const unsigned doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // [0, 365] + const unsigned mp = (5 * doy + 2) / 153; // [0, 11] + const unsigned d = doy - (153 * mp + 2) / 5 + 1; // [1, 31] + const unsigned m = mp < 10 ? mp + 3 : mp - 9; // [1, 12] + return std::tuple(y + (m <= 2), m, d); + } + + static std::string timestampToString(int64_t seconds, int64_t nanos, + const std::string& zoneName) { + auto& timezone = getTimezoneByName(zoneName); + seconds = timezone.convertToUTC(seconds); + time_t t = static_cast(seconds); + char buffer[100]; + constexpr auto SECOND_IN_DAY = 3600 * 24; + auto day = t < 0 ? (t - SECOND_IN_DAY + 1) / SECOND_IN_DAY : t / SECOND_IN_DAY; + + auto [y, m, d] = civil_from_days(day); + auto second_in_day = t % (3600 * 24); + if (second_in_day < 0) { + second_in_day += 3600 * 24; + } + auto h = second_in_day % (3600 * 24) / 3600; + auto min = second_in_day % 3600 / 60; + auto s = second_in_day % 60; + std::snprintf(buffer, sizeof(buffer), "%04d-%02d-%02d %02ld:%02ld:%02ld", y, m, d, h, min, s); + std::string result(buffer); + if (nanos) { + while (nanos % 10 == 0) nanos /= 10; + result = result + "." + std::to_string(nanos); + } + result = result + " " + zoneName; + return result; + } + + TEST(ConvertColumnReader, TestConvertStringVariantToTimestamp) { + constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024; + constexpr int TEST_CASES = 1024; + const std::string writerTimezone = "America/New_York"; + const std::string readerTimezone = "Australia/Sydney"; + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + std::unique_ptr fileType(Type::buildTypeFromString("struct")); + std::shared_ptr readType( + Type::buildTypeFromString("struct")); + WriterOptions options; + options.setTimezoneName(writerTimezone); + auto writer = createWriter(*fileType, &memStream, options); + auto batch = writer->createRowBatch(TEST_CASES); + auto structBatch = dynamic_cast(batch.get()); + auto& c1 = dynamic_cast(*structBatch->fields[0]); + auto& c2 = dynamic_cast(*structBatch->fields[1]); + + std::vector raw1, raw2; + raw1.reserve(TEST_CASES * 3); + raw2.reserve(TEST_CASES * 3); + std::vector ts1, ts2; + + for (int i = 0; i < TEST_CASES; i++) { + char buff[100]; + auto size = ::snprintf(buff, sizeof(buff), "%04d-%02d-27 12:34:56.789", 1960 + (i / 12), + (i % 12) + 1); + raw1.emplace_back(buff, size); + raw2.push_back(raw1.back() + " " + writerTimezone); + c1.data[i] = const_cast(raw1.back().c_str()); + c1.length[i] = raw1.back().length(); + c2.data[i] = const_cast(raw2.back().c_str()); + c2.length[i] = raw2.back().length(); + } + structBatch->numElements = c1.numElements = c2.numElements = TEST_CASES; + structBatch->hasNulls = c1.hasNulls = c2.hasNulls = false; + writer->add(*batch); + + for (int i = 0; i < TEST_CASES; i++) { + char buff[100]; + auto size = + ::snprintf(buff, sizeof(buff), "%04d-%02d-27 12:34:56", 1960 + (i / 12), (i % 12) + 1); + raw1.emplace_back(buff, size); + raw2.push_back(raw1.back() + " " + writerTimezone); + c1.data[i] = const_cast(raw1.back().c_str()); + c1.length[i] = raw1.back().length(); + c2.data[i] = const_cast(raw2.back().c_str()); + c2.length[i] = raw2.back().length(); + } + structBatch->numElements = c1.numElements = c2.numElements = TEST_CASES; + structBatch->hasNulls = c1.hasNulls = c2.hasNulls = false; + 
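+ // Second batch: the same timestamps but without fractional seconds, presumably to
+ // exercise the parse path that has no nanosecond component, before it is added below.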
writer->add(*batch); + + { + raw1.push_back("2024?11-14 00:01:02"); + raw2.push_back("2024-01-02 03:04:05.678 tz/error"); + c1.data[0] = const_cast(raw1.back().c_str()); + c1.length[0] = raw1.back().length(); + c2.data[0] = const_cast(raw2.back().c_str()); + c2.length[0] = raw2.back().length(); + + c1.notNull[1] = false; + c2.notNull[1] = false; + + raw1.push_back("2024-12-14 00:01:02.-1"); + raw2.push_back("2024-01-02 03:04:05.678"); + c1.data[2] = const_cast(raw1.back().c_str()); + c1.length[2] = raw1.back().length(); + c2.data[2] = const_cast(raw2.back().c_str()); + c2.length[2] = raw2.back().length(); + } + structBatch->numElements = c1.numElements = c2.numElements = 3; + structBatch->hasNulls = c1.hasNulls = c2.hasNulls = true; + writer->add(*batch); + + writer->close(); + + auto inStream = std::make_unique(memStream.getData(), memStream.getLength()); + auto pool = getDefaultPool(); + auto reader = createReader(*pool, std::move(inStream)); + RowReaderOptions rowReaderOptions; + rowReaderOptions.setUseTightNumericVector(true); + rowReaderOptions.setReadType(readType); + rowReaderOptions.setTimezoneName(readerTimezone); + rowReaderOptions.throwOnSchemaEvolutionOverflow(true); + auto rowReader = reader->createRowReader(rowReaderOptions); + auto readBatch = rowReader->createRowBatch(TEST_CASES * 2); + EXPECT_EQ(true, rowReader->next(*readBatch)); + + auto& readSturctBatch = dynamic_cast(*readBatch); + auto& readC1 = dynamic_cast(*readSturctBatch.fields[0]); + auto& readC2 = dynamic_cast(*readSturctBatch.fields[1]); + + for (int i = 0; i < TEST_CASES * 2; i++) { + EXPECT_TRUE(readC1.notNull[i]); + EXPECT_TRUE(readC2.notNull[i]); + EXPECT_EQ(raw1[i] + " " + readerTimezone, + timestampToString(readC1.data[i], readC1.nanoseconds[i], readerTimezone)); + EXPECT_EQ(raw2[i], timestampToString(readC2.data[i], readC2.nanoseconds[i], writerTimezone)); + } + + rowReaderOptions.throwOnSchemaEvolutionOverflow(false); + rowReader = reader->createRowReader(rowReaderOptions); + EXPECT_EQ(true, rowReader->next(*readBatch)); + EXPECT_EQ(true, rowReader->next(*readBatch)); + EXPECT_EQ(3, readBatch->numElements); + EXPECT_FALSE(readC1.notNull[0]); + EXPECT_FALSE(readC2.notNull[0]); + EXPECT_FALSE(readC1.notNull[1]); + EXPECT_FALSE(readC2.notNull[1]); + EXPECT_FALSE(readC1.notNull[2]); + EXPECT_FALSE(readC2.notNull[2]); + } + + TEST(ConvertColumnReader, TestConvertStringVariantToDecimal) { + constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024; + constexpr int TEST_CASES = 1024; + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + std::unique_ptr fileType(Type::buildTypeFromString("struct")); + std::shared_ptr readType( + Type::buildTypeFromString("struct")); + WriterOptions options; + auto writer = createWriter(*fileType, &memStream, options); + auto batch = writer->createRowBatch(TEST_CASES); + auto structBatch = dynamic_cast(batch.get()); + auto& c1 = dynamic_cast(*structBatch->fields[0]); + auto& c2 = dynamic_cast(*structBatch->fields[1]); + + // + std::vector> rawDataAndExpected; + + rawDataAndExpected = { + /* 0 */ {"123456789012345678901234567890123456789", false, false, int64_t(), Int128()}, + /* 1 */ {"123456789012345678901234567890.1234567890", false, false, int64_t(), Int128()}, + /* 2 */ {"-123456789012345678901234567890.1234567890", false, false, int64_t(), Int128()}, + /* 3 */ {"-foo.bar", false, false, int64_t(), Int128()}, + /* 4 */ {"-foo.123", false, false, int64_t(), Int128()}, + /* 5 */ {"-123.foo", false, false, int64_t(), Int128()}, + /* 6 */ {"-123foo.123", false, false, 
int64_t(), Int128()}, + /* 7 */ {"-123.123foo", false, false, int64_t(), Int128()}, + /* 8 */ {"-.", false, false, int64_t(), Int128()}, + /* 9 */ {"-", false, false, int64_t(), Int128()}, + /* 10 */ {".", false, false, int64_t(), Int128()}, + /* 11 */ {"", false, false, int64_t(), Int128()}, + /* 12 */ {".12345", false, false, int64_t(), Int128()}, + /* 13 */ {"12345.", false, false, int64_t(), Int128()}, + /* 14 */ {"-1", true, true, -100000LL, Int128("-10000000000")}, + /* 15 */ {"-1.0", true, true, -100000LL, Int128("-10000000000")}, + /* 16 */ {"1", true, true, 100000, Int128("10000000000")}, + /* 17 */ {"1.0", true, true, 100000, Int128("10000000000")}, + /* 18 */ {"12345", true, true, 1234500000, Int128("123450000000000")}, + /* 19 */ {"12345.12345", true, true, 1234512345LL, Int128("123451234500000")}, + /* 20 */ {"-12345.12345", true, true, -1234512345LL, Int128("-123451234500000")}, + /* 21 */ {"1234567890", false, true, int64_t(), Int128("12345678900000000000")}, + /* 22 */ {"-1234567890", false, true, int64_t(), Int128("-12345678900000000000")}, + /* 23 */ {"1234567890.123", false, true, int64_t(), Int128("12345678901230000000")}, + /* 24 */ {"-1234567890.1234567", false, true, int64_t(), Int128("-12345678901234567000")}, + /* 25 */ {"1234567890123.12345", false, true, int64_t(), Int128("12345678901231234500000")}, + /* 26 */ + {"-1234567890123.12345678901", false, true, int64_t(), Int128("-12345678901231234567890")}}; + for (int i = 0; i < rawDataAndExpected.size(); i++) { + c1.data[i] = c2.data[i] = const_cast(std::get<0>(rawDataAndExpected[i]).c_str()); + c1.length[i] = c2.length[i] = std::get<0>(rawDataAndExpected[i]).length(); + } + + structBatch->numElements = c1.numElements = c2.numElements = rawDataAndExpected.size(); + structBatch->hasNulls = c1.hasNulls = c2.hasNulls = false; + writer->add(*batch); + writer->close(); + + auto inStream = std::make_unique(memStream.getData(), memStream.getLength()); + auto pool = getDefaultPool(); + auto reader = createReader(*pool, std::move(inStream)); + RowReaderOptions rowReaderOptions; + rowReaderOptions.setUseTightNumericVector(true); + rowReaderOptions.setReadType(readType); + auto rowReader = reader->createRowReader(rowReaderOptions); + auto readBatch = rowReader->createRowBatch(TEST_CASES); + EXPECT_EQ(true, rowReader->next(*readBatch)); + + auto& readSturctBatch = dynamic_cast(*readBatch); + auto& readC1 = dynamic_cast(*readSturctBatch.fields[0]); + auto& readC2 = dynamic_cast(*readSturctBatch.fields[1]); + EXPECT_EQ(readBatch->numElements, rawDataAndExpected.size()); + + for (int i = 0; i < readBatch->numElements; i++) { + bool expectedNotNull1 = std::get<1>(rawDataAndExpected[i]); + bool expectedNotNull2 = std::get<2>(rawDataAndExpected[i]); + EXPECT_EQ(expectedNotNull1, readC1.notNull[i]) << i; + EXPECT_EQ(expectedNotNull2, readC2.notNull[i]) << i; + if (expectedNotNull1) { + EXPECT_EQ(std::get<3>(rawDataAndExpected[i]), readC1.values[i]) << i; + } + if (expectedNotNull2) { + EXPECT_EQ(std::get<4>(rawDataAndExpected[i]), readC2.values[i]) << i; + } + } + } + } // namespace orc diff --git a/c++/test/TestDecompression.cc b/c++/test/TestDecompression.cc index dc6caeda0e..125c5e85a4 100644 --- a/c++/test/TestDecompression.cc +++ b/c++/test/TestDecompression.cc @@ -395,6 +395,26 @@ namespace orc { ASSERT_TRUE(!result->Next(&ptr, &length)); } + TEST_F(TestDecompression, testLzoOverflow) { + const unsigned char bad_lzo_data[] = {// Header: compressedSize = 12, original = false + 0x18, 0x00, 0x00, + + // LZO body: token and literal 
length extension + 0x00, // token: extended literal length + 0xFF, // extension byte 1 + + // Literal data: only 10 bytes far less than 273 + 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'}; + + std::unique_ptr result = createDecompressor( + CompressionKind_LZO, + std::make_unique(bad_lzo_data, ARRAY_SIZE(bad_lzo_data)), + 128 * 1024, *getDefaultPool(), getDefaultReaderMetrics()); + const void* ptr; + int length; + EXPECT_THROW(result->Next(&ptr, &length), ParseError); + } + TEST_F(TestDecompression, testLz4Empty) { const unsigned char buffer[] = {0}; std::unique_ptr result = createDecompressor( @@ -545,7 +565,7 @@ namespace orc { *getDefaultPool(), getDefaultReaderMetrics()); const void* ptr; int length; - ASSERT_THROW(result->BackUp(20), std::logic_error); + ASSERT_THROW(result->BackUp(20), CompressionError); ASSERT_EQ(true, result->Next(&ptr, &length)); ASSERT_EQ(30, length); for (int i = 0; i < 10; ++i) { @@ -554,7 +574,7 @@ namespace orc { } } result->BackUp(10); - ASSERT_THROW(result->BackUp(2), std::logic_error); + ASSERT_THROW(result->BackUp(2), CompressionError); ASSERT_EQ(true, result->Next(&ptr, &length)); ASSERT_EQ(10, length); for (int i = 0; i < 10; ++i) { diff --git a/c++/test/TestDictionaryEncoding.cc b/c++/test/TestDictionaryEncoding.cc index f3dcaa0067..40c1b1a605 100644 --- a/c++/test/TestDictionaryEncoding.cc +++ b/c++/test/TestDictionaryEncoding.cc @@ -25,6 +25,7 @@ #include "wrap/gtest-wrapper.h" #include +#include #include namespace orc { @@ -53,6 +54,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024); + options.setMemoryBlockSize(64); options.setCompressionBlockSize(1024); options.setCompression(CompressionKind_ZLIB); options.setMemoryPool(pool); @@ -109,6 +111,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024); + options.setMemoryBlockSize(64); options.setCompressionBlockSize(1024); options.setCompression(CompressionKind_ZLIB); options.setMemoryPool(pool); @@ -171,6 +174,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024); options.setCompressionBlockSize(1024); + options.setMemoryBlockSize(64); options.setCompression(CompressionKind_ZLIB); options.setMemoryPool(pool); options.setDictionaryKeySizeThreshold(threshold); @@ -233,6 +237,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024); + options.setMemoryBlockSize(64); options.setCompressionBlockSize(1024); options.setCompression(CompressionKind_ZLIB); options.setMemoryPool(pool); @@ -302,7 +307,8 @@ namespace orc { WriterOptions options; options.setStripeSize(1); - options.setCompressionBlockSize(1024); + options.setMemoryBlockSize(1024); + options.setCompressionBlockSize(2 * 1024); options.setCompression(CompressionKind_ZLIB); options.setMemoryPool(pool); options.setDictionaryKeySizeThreshold(threshold); @@ -429,4 +435,57 @@ namespace orc { testDictionaryMultipleStripes(DICT_THRESHOLD, false); testDictionaryMultipleStripes(FALLBACK_THRESHOLD, false); } + + TEST(DictionaryEncoding, decodeDictionary) { + size_t rowCount = 8192; + size_t dictionarySize = 100; + auto* memoryPool = getDefaultPool(); + + auto encodedStringBatch = std::make_shared(rowCount, *memoryPool); + EXPECT_FALSE(encodedStringBatch->dictionaryDecoded); + encodedStringBatch->numElements = rowCount; + encodedStringBatch->hasNulls = true; + encodedStringBatch->isEncoded = true; + encodedStringBatch->dictionary = std::make_shared(*memoryPool); + + auto& dictionary = *encodedStringBatch->dictionary; + dictionary.dictionaryBlob.resize(3 * dictionarySize); + 
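+ // dictionaryBlob stores the dictionary entries back to back and dictionaryOffset keeps
+ // one extra element so that entry i spans [dictionaryOffset[i], dictionaryOffset[i + 1]);
+ // every entry written below is a 3-character zero-padded index, hence the 3x sizing.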
dictionary.dictionaryOffset.resize(dictionarySize + 1); + dictionary.dictionaryOffset[0] = 0; + for (uint64_t i = 0; i < dictionarySize; ++i) { + std::ostringstream oss; + oss << std::setw(3) << std::setfill('0') << i; + + auto str = oss.str(); + memcpy(&dictionary.dictionaryBlob[3 * i], str.data(), str.size()); + dictionary.dictionaryOffset[i + 1] = 3 * (i + 1); + } + + for (uint64_t i = 0; i < rowCount; ++i) { + if (i % 10 == 0) { + encodedStringBatch->notNull[i] = 0; + encodedStringBatch->index[i] = 0; + } else { + encodedStringBatch->notNull[i] = 1; + encodedStringBatch->index[i] = i % dictionarySize; + } + } + + encodedStringBatch->decodeDictionary(); + EXPECT_TRUE(encodedStringBatch->dictionaryDecoded); + EXPECT_EQ(0, encodedStringBatch->blob.size()); + + for (uint64_t i = 0; i < rowCount; ++i) { + if (encodedStringBatch->notNull[i]) { + auto index = encodedStringBatch->index[i]; + char* buf = nullptr; + int64_t buf_size = 0; + dictionary.getValueByIndex(index, buf, buf_size); + + EXPECT_EQ(buf, encodedStringBatch->data[i]); + EXPECT_EQ(buf_size, encodedStringBatch->length[i]); + } + } + } + } // namespace orc diff --git a/c++/test/TestInt128.cc b/c++/test/TestInt128.cc index 54dcff4567..be5b65b3a7 100644 --- a/c++/test/TestInt128.cc +++ b/c++/test/TestInt128.cc @@ -555,6 +555,11 @@ namespace orc { num = Int128("-12345678901122334455667788990011122233"); EXPECT_EQ("-12345678901122334455667788990011122233", num.toString()); + + num = Int128::maximumValue(); + EXPECT_EQ("170141183460469231731687303715884105727", num.toString()); + num = Int128::minimumValue(); + EXPECT_EQ("-170141183460469231731687303715884105728", num.toString()); } TEST(Int128, testToDecimalString) { diff --git a/c++/test/TestPredicateLeaf.cc b/c++/test/TestPredicateLeaf.cc index 2703776e39..3946123ec5 100644 --- a/c++/test/TestPredicateLeaf.cc +++ b/c++/test/TestPredicateLeaf.cc @@ -168,6 +168,12 @@ namespace orc { return colStats; } + static proto::ColumnStatistics createIncompleteNullStats() { + proto::ColumnStatistics colStats; + colStats.set_number_of_values(0); + return colStats; + } + static TruthValue evaluate(const PredicateLeaf& pred, const proto::ColumnStatistics& pbStats, const BloomFilter* bf = nullptr) { return pred.evaluate(WriterVersion_ORC_135, pbStats, bf); @@ -663,4 +669,10 @@ namespace orc { evaluate(pred8, createTimestampStats(2114380800, 1109000, 2114380800, 6789100))); } + TEST(TestPredicateLeaf, testLackOfSataistics) { + PredicateLeaf pred(PredicateLeaf::Operator::IS_NULL, PredicateDataType::STRING, 1, {}); + EXPECT_EQ(TruthValue::YES_NO, evaluate(pred, createStringStats("c", "d", true))); + EXPECT_EQ(TruthValue::YES_NO_NULL, evaluate(pred, createIncompleteNullStats())); + } + } // namespace orc diff --git a/c++/test/TestPredicatePushdown.cc b/c++/test/TestPredicatePushdown.cc index e949fc2898..5c8ed14e73 100644 --- a/c++/test/TestPredicatePushdown.cc +++ b/c++/test/TestPredicatePushdown.cc @@ -33,6 +33,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024 * 1024) .setCompressionBlockSize(1024) + .setMemoryBlockSize(64) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(rowIndexStride); @@ -510,6 +511,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1) .setCompressionBlockSize(1024) + .setMemoryBlockSize(64) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(1000); diff --git a/c++/test/TestReader.cc b/c++/test/TestReader.cc index f709f693f1..f9df6edc92 100644 --- a/c++/test/TestReader.cc +++ 
b/c++/test/TestReader.cc @@ -155,7 +155,10 @@ namespace orc { ASSERT_THAT(rowReader->getSelectedColumns(), ElementsAreArray(expected)); } - std::unique_ptr createNestedListMemReader(MemoryOutputStream& memStream) { + std::unique_ptr createNestedListMemReader(MemoryOutputStream& memStream, + const std::vector& stripesToPrefetch, + const std::list& columnsToPrefetch, + bool prefetchTwice) { MemoryPool* pool = getDefaultPool(); auto type = std::unique_ptr( @@ -166,6 +169,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024 * 1024) .setCompressionBlockSize(1024) + .setMemoryBlockSize(64) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(1000); @@ -217,20 +221,43 @@ namespace orc { auto inStream = std::make_unique(memStream.getData(), memStream.getLength()); ReaderOptions readerOptions; readerOptions.setMemoryPool(*pool); - return createReader(std::move(inStream), readerOptions); + auto reader = createReader(std::move(inStream), readerOptions); + + reader->preBuffer(stripesToPrefetch, columnsToPrefetch); + if (prefetchTwice) { + reader->preBuffer(stripesToPrefetch, columnsToPrefetch); + } + + return reader; } - TEST(TestReadIntent, testListAll) { + class TestReadIntentFromNestedList + : public ::testing::TestWithParam< + std::tuple, std::list, bool>> {}; + + TEST_P(TestReadIntentFromNestedList, testListAll) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedListMemReader(memStream); + std::unique_ptr reader = + createNestedListMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of int_array. verifySelection(reader, {{1, ReadIntent_ALL}}, {0, 1, 2}); } - TEST(TestReadIntent, testListOffsets) { + TEST_P(TestReadIntentFromNestedList, testListOffsets) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedListMemReader(memStream); + std::unique_ptr reader = + createNestedListMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select only the offsets of int_array. verifySelection(reader, {{1, ReadIntent_OFFSETS}}, {0, 1}); @@ -243,26 +270,44 @@ namespace orc { verifySelection(reader, {{3, ReadIntent_OFFSETS}, {5, ReadIntent_OFFSETS}}, {0, 3, 4, 5}); } - TEST(TestReadIntent, testListAllAndOffsets) { + TEST_P(TestReadIntentFromNestedList, testListAllAndOffsets) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedListMemReader(memStream); + std::unique_ptr reader = + createNestedListMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of int_array and only the outermost offsets of int_array_array_array. 
verifySelection(reader, {{1, ReadIntent_ALL}, {3, ReadIntent_OFFSETS}}, {0, 1, 2, 3}); } - TEST(TestReadIntent, testListConflictingIntent) { + TEST_P(TestReadIntentFromNestedList, testListConflictingIntent) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedListMemReader(memStream); + std::unique_ptr reader = + createNestedListMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // test conflicting ReadIntent on nested list. verifySelection(reader, {{3, ReadIntent_OFFSETS}, {5, ReadIntent_ALL}}, {0, 3, 4, 5, 6}); verifySelection(reader, {{3, ReadIntent_ALL}, {5, ReadIntent_OFFSETS}}, {0, 3, 4, 5, 6}); } - TEST(TestReadIntent, testRowBatchContent) { + TEST_P(TestReadIntentFromNestedList, testRowBatchContent) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedListMemReader(memStream); + std::unique_ptr reader = + createNestedListMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of int_array and only the offsets of int_array_array. RowReaderOptions::IdReadIntentMap idReadIntentMap = {{1, ReadIntent_ALL}, @@ -298,7 +343,24 @@ namespace orc { EXPECT_EQ(nullptr, intArrayArrayArrayBatch.elements); } - std::unique_ptr createNestedMapMemReader(MemoryOutputStream& memStream) { + INSTANTIATE_TEST_SUITE_P( + TestReadIntentFromNestedListInstance, TestReadIntentFromNestedList, + ::testing::Values( + std::make_tuple(std::vector{}, std::list{}, true), + std::make_tuple(std::vector{}, std::list{}, false), + std::make_tuple(std::vector{}, std::list{1, 3}, true), + std::make_tuple(std::vector{}, std::list{1, 3}, false), + std::make_tuple(std::vector{0}, std::list{}, true), + std::make_tuple(std::vector{0}, std::list{}, false), + std::make_tuple(std::vector{0}, std::list{1, 3}, true), + std::make_tuple(std::vector{0}, std::list{1, 3}, false), + std::make_tuple(std::vector{1000}, std::list{1000}, true), + std::make_tuple(std::vector{1000}, std::list{1000}, false))); + + std::unique_ptr createNestedMapMemReader(MemoryOutputStream& memStream, + const std::vector& stripesToPrefetch, + const std::list& columnsToPrefetch, + bool prefetchTwice) { MemoryPool* pool = getDefaultPool(); auto type = std::unique_ptr( @@ -310,6 +372,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024 * 1024) .setCompressionBlockSize(1024) + .setMemoryBlockSize(64) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(1000); @@ -387,20 +450,42 @@ namespace orc { auto inStream = std::make_unique(memStream.getData(), memStream.getLength()); ReaderOptions readerOptions; readerOptions.setMemoryPool(*pool); - return createReader(std::move(inStream), readerOptions); + auto reader = createReader(std::move(inStream), readerOptions); + + reader->preBuffer(stripesToPrefetch, columnsToPrefetch); + if (prefetchTwice) { + reader->preBuffer(stripesToPrefetch, columnsToPrefetch); + } + return reader; } - TEST(TestReadIntent, testMapAll) { + class TestReadIntentFromNestedMap + : public ::testing::TestWithParam< + std::tuple, std::list, bool>> {}; + + 
TEST_P(TestReadIntentFromNestedMap, testMapAll) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedMapMemReader(memStream); + std::unique_ptr reader = + createNestedMapMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of single_map. verifySelection(reader, {{2, ReadIntent_ALL}}, {0, 2, 3, 4}); } - TEST(TestReadIntent, testMapOffsets) { + TEST_P(TestReadIntentFromNestedMap, testMapOffsets) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedMapMemReader(memStream); + std::unique_ptr reader = + createNestedMapMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select only the offsets of single_map. verifySelection(reader, {{2, ReadIntent_OFFSETS}}, {0, 2}); @@ -412,17 +497,29 @@ namespace orc { verifySelection(reader, {{5, ReadIntent_OFFSETS}, {9, ReadIntent_OFFSETS}}, {0, 5, 7, 9}); } - TEST(TestReadIntent, testMapAllAndOffsets) { + TEST_P(TestReadIntentFromNestedMap, testMapAllAndOffsets) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedMapMemReader(memStream); + std::unique_ptr reader = + createNestedMapMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of single_map and only the outermost offsets of nested_map. verifySelection(reader, {{2, ReadIntent_ALL}, {5, ReadIntent_OFFSETS}}, {0, 2, 3, 4, 5}); } - TEST(TestReadIntent, testMapConflictingIntent) { + TEST_P(TestReadIntentFromNestedMap, testMapConflictingIntent) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedMapMemReader(memStream); + std::unique_ptr reader = + createNestedMapMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // test conflicting ReadIntent on nested_map. verifySelection(reader, {{5, ReadIntent_OFFSETS}, {9, ReadIntent_ALL}}, {0, 5, 7, 9, 10, 11}); @@ -432,9 +529,15 @@ namespace orc { {0, 5, 7, 8, 9, 10, 11}); } - TEST(TestReadIntent, testMapRowBatchContent) { + TEST_P(TestReadIntentFromNestedMap, testMapRowBatchContent) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedMapMemReader(memStream); + std::unique_ptr reader = + createNestedMapMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of single_map and only the offsets of nested_map. 
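// With ReadIntent_OFFSETS only the offsets of nested_map are materialized; its element batch is
// expected to remain null when the row batch content is inspected below.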
RowReaderOptions::IdReadIntentMap idReadIntentMap = {{2, ReadIntent_ALL}, @@ -480,7 +583,24 @@ namespace orc { EXPECT_EQ(nullptr, nestedMapBatch.elements); } - std::unique_ptr createNestedUnionMemReader(MemoryOutputStream& memStream) { + INSTANTIATE_TEST_SUITE_P( + TestReadIntentFromNestedMapInstance, TestReadIntentFromNestedMap, + ::testing::Values( + std::make_tuple(std::vector{}, std::list{}, true), + std::make_tuple(std::vector{}, std::list{}, false), + std::make_tuple(std::vector{}, std::list{1, 5}, true), + std::make_tuple(std::vector{}, std::list{1, 5}, false), + std::make_tuple(std::vector{0}, std::list{}, true), + std::make_tuple(std::vector{0}, std::list{}, false), + std::make_tuple(std::vector{0}, std::list{1, 5}, true), + std::make_tuple(std::vector{0}, std::list{1, 5}, false), + std::make_tuple(std::vector{1000}, std::list{1000}, true), + std::make_tuple(std::vector{1000}, std::list{1000}, false))); + + std::unique_ptr createNestedUnionMemReader(MemoryOutputStream& memStream, + const std::vector& stripesToPrefetch, + const std::list& columnsToPrefetch, + bool prefetchTwice) { MemoryPool* pool = getDefaultPool(); auto type = std::unique_ptr( @@ -492,6 +612,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024 * 1024) .setCompressionBlockSize(1024) + .setMemoryBlockSize(64) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(1000); @@ -563,20 +684,43 @@ namespace orc { ReaderOptions readerOptions; readerOptions.setMemoryPool(*pool); readerOptions.setReaderMetrics(nullptr); - return createReader(std::move(inStream), readerOptions); + auto reader = createReader(std::move(inStream), readerOptions); + + reader->preBuffer(stripesToPrefetch, columnsToPrefetch); + if (prefetchTwice) { + reader->preBuffer(stripesToPrefetch, columnsToPrefetch); + } + + return reader; } - TEST(TestReadIntent, testUnionAll) { + class TestReadIntentFromNestedUnion + : public ::testing::TestWithParam< + std::tuple, std::list, bool>> {}; + + TEST_P(TestReadIntentFromNestedUnion, testUnionAll) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedUnionMemReader(memStream); + std::unique_ptr reader = + createNestedUnionMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of single_union. verifySelection(reader, {{2, ReadIntent_ALL}}, {0, 2, 3, 4}); } - TEST(TestReadIntent, testUnionOffsets) { + TEST_P(TestReadIntentFromNestedUnion, testUnionOffsets) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedUnionMemReader(memStream); + std::unique_ptr reader = + createNestedUnionMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select only the offsets of single_union. 
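// For the union column, ReadIntent_OFFSETS selects just the union column itself; its branch
// columns 3 and 4 stay unselected, hence the expected selection {0, 2}.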
verifySelection(reader, {{2, ReadIntent_OFFSETS}}, {0, 2}); @@ -589,17 +733,29 @@ namespace orc { {0, 2, 5, 6, 7, 8, 11}); } - TEST(TestReadIntent, testUnionAllAndOffsets) { + TEST_P(TestReadIntentFromNestedUnion, testUnionAllAndOffsets) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedUnionMemReader(memStream); + std::unique_ptr reader = + createNestedUnionMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of single_union and only the outermost offsets of nested_union. verifySelection(reader, {{2, ReadIntent_ALL}, {5, ReadIntent_OFFSETS}}, {0, 2, 3, 4, 5}); } - TEST(TestReadIntent, testUnionConflictingIntent) { + TEST_P(TestReadIntentFromNestedUnion, testUnionConflictingIntent) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedUnionMemReader(memStream); + std::unique_ptr reader = + createNestedUnionMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // test conflicting ReadIntent on nested_union. verifySelection(reader, {{5, ReadIntent_OFFSETS}, {8, ReadIntent_ALL}}, @@ -610,9 +766,15 @@ namespace orc { {0, 5, 6, 7, 8, 9, 10, 11}); } - TEST(TestReadIntent, testUnionRowBatchContent) { + TEST_P(TestReadIntentFromNestedUnion, testUnionRowBatchContent) { + const auto& params = GetParam(); + const std::vector& stripesToPrefetch = std::get<0>(params); + const std::list& columnsToPrefetch = std::get<1>(params); + bool prefetchTwice = std::get<2>(params); + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - std::unique_ptr reader = createNestedUnionMemReader(memStream); + std::unique_ptr reader = + createNestedUnionMemReader(memStream, stripesToPrefetch, columnsToPrefetch, prefetchTwice); // select all of single_union and only the offsets of nested_union. 
RowReaderOptions::IdReadIntentMap idReadIntentMap = {{2, ReadIntent_ALL}, @@ -662,10 +824,25 @@ namespace orc { EXPECT_EQ(1, nestedUnionBatch.offsets.data()[1]); } + INSTANTIATE_TEST_SUITE_P( + TestReadIntentFromNestedUnionInstance, TestReadIntentFromNestedUnion, + ::testing::Values( + std::make_tuple(std::vector{}, std::list{}, true), + std::make_tuple(std::vector{}, std::list{}, false), + std::make_tuple(std::vector{}, std::list{1, 2}, true), + std::make_tuple(std::vector{}, std::list{1, 2}, false), + std::make_tuple(std::vector{0}, std::list{}, true), + std::make_tuple(std::vector{0}, std::list{}, false), + std::make_tuple(std::vector{0}, std::list{1, 2}, true), + std::make_tuple(std::vector{0}, std::list{1, 2}, false), + std::make_tuple(std::vector{1000}, std::list{1000}, true), + std::make_tuple(std::vector{1000}, std::list{1000}, false))); + TEST(TestReadIntent, testSeekOverEmptyPresentStream) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); uint64_t rowCount = 5000; + { auto type = std::unique_ptr( Type::buildTypeFromString("struct,col3:struct," @@ -673,6 +850,7 @@ namespace orc { WriterOptions options; options.setStripeSize(1024 * 1024) .setCompressionBlockSize(1024) + .setMemoryBlockSize(64) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(1000); diff --git a/c++/test/TestRleEncoder.cc b/c++/test/TestRleEncoder.cc index 1c24a69515..c69fc9cabc 100644 --- a/c++/test/TestRleEncoder.cc +++ b/c++/test/TestRleEncoder.cc @@ -84,8 +84,8 @@ namespace orc { std::make_unique(memStream.getData(), memStream.getLength()), isSinged, version, *getDefaultPool(), getDefaultReaderMetrics()); - int64_t* decodedData = new int64_t[numValues]; - decoder->next(decodedData, numValues, notNull); + std::vector decodedData(numValues); + decoder->next(decodedData.data(), numValues, notNull); for (uint64_t i = 0; i < numValues; ++i) { if (!notNull || notNull[i]) { @@ -93,7 +93,12 @@ namespace orc { } } - delete[] decodedData; + decoder->next(decodedData.data(), numValues, notNull); + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + EXPECT_EQ(data[i], decodedData[i]); + } + } } std::unique_ptr RleTest::getEncoder(RleVersion version, MemoryOutputStream& memStream, @@ -128,6 +133,9 @@ namespace orc { char* notNull = numNulls == 0 ? nullptr : new char[numValues]; int64_t* data = new int64_t[numValues]; generateData(numValues, start, delta, random, data, numNulls, notNull); + encoder->add(data, numValues, notNull); + encoder->finishEncode(); + encoder->add(data, numValues, notNull); encoder->flush(); @@ -243,6 +251,9 @@ namespace orc { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); std::unique_ptr encoder = getEncoder(RleVersion_2, memStream, isSigned); + encoder->add(data, numValues, nullptr); + encoder->finishEncode(); + encoder->add(data, numValues, nullptr); encoder->flush(); @@ -274,5 +285,43 @@ namespace orc { runExampleTest(data, 9, expectedEncoded, 13); } + TEST_P(RleTest, RleV2_value_limit_test) { + std::vector inputData = {-9007199254740992l, + -8725724278030337l, + -1125762467889153l, + -1l, + -9007199254740992l, + -9007199254740992l, + -497l, + 127l, + -1l, + -72057594037927936l, + -4194304l, + -9007199254740992l, + -4503599593816065l, + -4194304l, + -8936830510563329l, + -9007199254740992l, + -1l, + -70334384439312l, + -4063233l, + -6755399441973249l}; + int numValues = inputData.size(); + + // Invoke the encoder. 
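// The encoder is driven twice on purpose: add() followed by finishEncode() emits one complete
// encoded run, and the second add() followed by flush() appends another. decodeAndVerify() then
// decodes the stream twice, confirming that both runs round-trip after the encoder is reused.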
+ const bool isSigned = true; + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + + std::unique_ptr encoder = getEncoder(RleVersion_2, memStream, isSigned); + encoder->add(inputData.data(), numValues, nullptr); + encoder->finishEncode(); + + encoder->add(inputData.data(), numValues, nullptr); + encoder->flush(); + + // Decode and verify. + decodeAndVerify(RleVersion_2, memStream, inputData.data(), numValues, nullptr, isSigned); + } + INSTANTIATE_TEST_SUITE_P(OrcTest, RleTest, Values(true, false)); } // namespace orc diff --git a/c++/test/TestSchemaEvolution.cc b/c++/test/TestSchemaEvolution.cc index c52ba009fa..d146853573 100644 --- a/c++/test/TestSchemaEvolution.cc +++ b/c++/test/TestSchemaEvolution.cc @@ -45,17 +45,17 @@ namespace orc { directEncoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); EXPECT_CALL(streams, getEncoding(testing::_)).WillRepeatedly(testing::Return(directEncoding)); - EXPECT_CALL(streams, getStreamProxy(testing::_, testing::_, testing::_)) - .WillRepeatedly(testing::Return(nullptr)); - std::string dummyStream("dummy"); - ON_CALL(streams, getStreamProxy(1, proto::Stream_Kind_SECONDARY, testing::_)) - .WillByDefault(testing::Return( - new SeekableArrayInputStream(dummyStream.c_str(), dummyStream.length()))); + EXPECT_CALL(streams, getStreamProxy(testing::_, testing::_, testing::_)) + .WillRepeatedly(testing::ReturnNew(dummyStream.c_str(), + dummyStream.length())); + EXPECT_CALL(streams, isDecimalAsLong()).WillRepeatedly(testing::Return(false)); EXPECT_CALL(streams, getSchemaEvolution()).WillRepeatedly(testing::Return(&se)); + EXPECT_CALL(streams, getSelectedColumns()) + .WillRepeatedly(testing::Return(std::vector{true, true})); - EXPECT_TRUE(buildReader(*fileType, streams) != nullptr); + EXPECT_TRUE(buildReader(*fileType, streams, true) != nullptr); } return true; } @@ -66,8 +66,8 @@ namespace orc { {2, "struct"}, {3, "struct"}, {4, "struct"}, {5, "struct"}, {6, "struct"}, {7, "struct"}, - {8, "struct"}, {9, "struct"}, - {10, "struct"}, {11, "struct"}, + {8, "struct"}, {9, "struct"}, + {10, "struct"}, {11, "struct"}, {12, "struct"}, {13, "struct"}, {14, "struct"}, {15, "struct"}, {16, "struct"}}; @@ -148,6 +148,38 @@ namespace orc { } } + // conversion from string variant to numeric + for (size_t i = 7; i <= 11; i++) { + for (size_t j = 0; j <= 6; j++) { + canConvert[i][j] = true; + needConvert[i][j] = true; + } + } + + // conversion from string variant to string variant + for (size_t i = 7; i <= 11; i++) { + for (size_t j = 7; j <= 11; j++) { + canConvert[i][j] = true; + needConvert[i][j] = (i != j); + } + } + + // conversion from string variant to decimal + for (size_t i = 7; i <= 11; i++) { + for (size_t j = 12; j <= 13; j++) { + canConvert[i][j] = true; + needConvert[i][j] = (i != j); + } + } + + // conversion from string variant to timestamp + for (size_t i = 7; i <= 11; i++) { + for (size_t j = 14; j <= 15; j++) { + canConvert[i][j] = true; + needConvert[i][j] = (i != j); + } + } + for (size_t i = 0; i < typesSize; i++) { for (size_t j = 0; j < typesSize; j++) { testConvertReader(types[i], types[j], canConvert[i][j], needConvert[i][j]); diff --git a/c++/test/TestSearchArgument.cc b/c++/test/TestSearchArgument.cc index bf9b82ea5c..09904139cb 100644 --- a/c++/test/TestSearchArgument.cc +++ b/c++/test/TestSearchArgument.cc @@ -481,4 +481,45 @@ namespace orc { std::invalid_argument); } + TEST(TestSearchArgument, testBadTreeNode) { + auto invalidNode = std::make_shared(ExpressionTree::Operator::NOT, NodeList{}); + EXPECT_THROW(invalidNode->toString(), 
std::invalid_argument); + + std::vector leaves; + leaves.push_back(TruthValue::YES); + EXPECT_THROW(invalidNode->evaluate(leaves), std::invalid_argument); + } + + TEST(TestSearchArgument, testMaybe) { + auto expectedSarg = + SearchArgumentFactory::newBuilder() + ->startNot() + .startOr() + .isNull("x", PredicateDataType::LONG) + .between("y", PredicateDataType::DECIMAL, Literal(10, 3, 0), Literal(200, 3, 1)) + .in("z", PredicateDataType::LONG, + {Literal(static_cast(1)), Literal(static_cast(2)), + Literal(static_cast(3))}) + .nullSafeEquals("a", PredicateDataType::STRING, Literal("stinger", 7)) + .end() + .end() + .build(); + + auto sargWithMaybe = + SearchArgumentFactory::newBuilder() + ->startNot() + .startOr() + .isNull("x", PredicateDataType::LONG) + .between("y", PredicateDataType::DECIMAL, Literal(10, 3, 0), Literal(200, 3, 1)) + .in("z", PredicateDataType::LONG, + {Literal(static_cast(1)), Literal(static_cast(2)), + Literal(static_cast(3))}) + .maybe() + .nullSafeEquals("a", PredicateDataType::STRING, Literal("stinger", 7)) + .end() + .end() + .build(); + EXPECT_EQ(expectedSarg->toString(), sargWithMaybe->toString()); + } + } // namespace orc diff --git a/c++/test/TestStatistics.cc b/c++/test/TestStatistics.cc new file mode 100644 index 0000000000..61c5e08cb6 --- /dev/null +++ b/c++/test/TestStatistics.cc @@ -0,0 +1,230 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/OrcFile.hh" + +#include "MemoryInputStream.hh" +#include "MemoryOutputStream.hh" +#include "TestUtil.hh" + +#include "wrap/gtest-wrapper.h" + +#include +#include +#include + +namespace orc { + +#define ENSURE_DYNAMIC_CAST_NOT_NULL(PTR) \ + if (PTR == NULL) throw std::logic_error("dynamic_cast returns null"); + + const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M + + static std::unique_ptr createWriter(uint64_t stripeSize, const Type& type, + MemoryPool* memoryPool, OutputStream* stream) { + WriterOptions options; + options.setStripeSize(stripeSize); + options.setCompressionBlockSize(256); + options.setMemoryBlockSize(256); + options.setCompression(CompressionKind_ZLIB); + options.setMemoryPool(memoryPool); + options.setRowIndexStride(10); + return createWriter(type, stream, options); + } + + static std::unique_ptr createReader(MemoryPool* memoryPool, + MemoryOutputStream& memStream) { + std::unique_ptr inStream( + new MemoryInputStream(memStream.getData(), memStream.getLength())); + ReaderOptions options; + options.setMemoryPool(*memoryPool); + return createReader(std::move(inStream), options); + } + + TEST(Statistics, geometryStatsWithNull) { + std::unique_ptr const type(Type::buildTypeFromString("struct")); + + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + MemoryPool* const pool = getDefaultPool(); + uint64_t const stripeSize = 32; // small stripe size to garantee multi stripes + std::unique_ptr writer = createWriter(stripeSize, *type, pool, &memStream); + + uint64_t const batchCount = 1000; + uint64_t const batches = 10; + std::unique_ptr const batch = writer->createRowBatch(batchCount); + StructVectorBatch* structBatch = dynamic_cast(batch.get()); + ENSURE_DYNAMIC_CAST_NOT_NULL(structBatch); + + StringVectorBatch* strBatch = dynamic_cast(structBatch->fields[0]); + ENSURE_DYNAMIC_CAST_NOT_NULL(strBatch); + + // create str values + std::vector wkbs; + std::array mins = {geospatial::INF, geospatial::INF, geospatial::INF, + geospatial::INF}; + std::array maxs = {-geospatial::INF, -geospatial::INF, -geospatial::INF, + -geospatial::INF}; + for (uint64_t i = 1; i < batchCount - 1; ++i) { + if (i % 3 == 0) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0}, false, false)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + } else if (i % 3 == 1) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0, i * 1.0}, true, false)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + mins[2] = std::min(mins[2], i * 1.0); + maxs[2] = std::max(maxs[2], i * 1.0); + } else if (i % 3 == 2) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0, i * 1.0, i * 1.0}, true, true)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + mins[2] = std::min(mins[2], i * 1.0); + maxs[2] = std::max(maxs[2], i * 1.0); + mins[3] = std::min(mins[3], i * 1.0); + maxs[3] = std::max(maxs[3], i * 1.0); + } + } + for (uint64_t i = 1; i < batchCount - 1; ++i) { + strBatch->data[i] = const_cast(wkbs[i - 1].c_str()); + strBatch->length[i] = static_cast(wkbs[i - 1].length()); + } + + structBatch->numElements = batchCount; + strBatch->numElements = batchCount; + + structBatch->hasNulls = true; + structBatch->notNull[0] = '\0'; + structBatch->notNull[batchCount - 1] = '\0'; + 
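// The inner string column mirrors the struct's null pattern: rows 0 and batchCount - 1 carry no
// WKB value, so only rows 1 .. batchCount - 2 contribute to the geospatial statistics.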
strBatch->hasNulls = true; + strBatch->notNull[0] = '\0'; + strBatch->notNull[batchCount - 1] = '\0'; + + for (uint64_t i = 0; i < batches; ++i) { + writer->add(*batch.get()); + } + writer->close(); + + std::unique_ptr reader = createReader(pool, memStream); + + // check column 1 (string) file stats + auto stats1 = reader->getColumnStatistics(1); + const GeospatialColumnStatistics* geoFileStats = + dynamic_cast(stats1.get()); + ENSURE_DYNAMIC_CAST_NOT_NULL(geoFileStats); + EXPECT_EQ(geoFileStats->getGeospatialTypes().size(), 3); + EXPECT_EQ(geoFileStats->getGeospatialTypes()[0], 1); + EXPECT_EQ(geoFileStats->getGeospatialTypes()[1], 1001); + EXPECT_EQ(geoFileStats->getGeospatialTypes()[2], 3001); + std::array expectValid = {true, true, true, true}; + std::array expectEmpty = {false, false, false, false}; + EXPECT_EQ(geoFileStats->getBoundingBox().dimensionValid(), expectValid); + EXPECT_EQ(geoFileStats->getBoundingBox().dimensionEmpty(), expectEmpty); + EXPECT_EQ(geoFileStats->getBoundingBox().lowerBound(), mins); + EXPECT_EQ(geoFileStats->getBoundingBox().upperBound(), maxs); + } + + TEST(Statistics, geographyStatsWithNull) { + std::unique_ptr const type( + Type::buildTypeFromString("struct")); + + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + MemoryPool* const pool = getDefaultPool(); + uint64_t const stripeSize = 32; // small stripe size to garantee multi stripes + std::unique_ptr writer = createWriter(stripeSize, *type, pool, &memStream); + + uint64_t const batchCount = 1000; + uint64_t const batches = 10; + std::unique_ptr const batch = writer->createRowBatch(batchCount); + StructVectorBatch* structBatch = dynamic_cast(batch.get()); + ENSURE_DYNAMIC_CAST_NOT_NULL(structBatch); + + StringVectorBatch* strBatch = dynamic_cast(structBatch->fields[0]); + ENSURE_DYNAMIC_CAST_NOT_NULL(strBatch); + + // create str values + std::vector wkbs; + std::array mins = {geospatial::INF, geospatial::INF, geospatial::INF, + geospatial::INF}; + std::array maxs = {-geospatial::INF, -geospatial::INF, -geospatial::INF, + -geospatial::INF}; + for (uint64_t i = 1; i < batchCount - 1; ++i) { + if (i % 3 == 0) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0}, false, false)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + } else if (i % 3 == 1) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0, i * 1.0}, true, false)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + mins[2] = std::min(mins[2], i * 1.0); + maxs[2] = std::max(maxs[2], i * 1.0); + } else if (i % 3 == 2) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0, i * 1.0, i * 1.0}, true, true)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + mins[2] = std::min(mins[2], i * 1.0); + maxs[2] = std::max(maxs[2], i * 1.0); + mins[3] = std::min(mins[3], i * 1.0); + maxs[3] = std::max(maxs[3], i * 1.0); + } + } + for (uint64_t i = 1; i < batchCount - 1; ++i) { + strBatch->data[i] = const_cast(wkbs[i - 1].c_str()); + strBatch->length[i] = static_cast(wkbs[i - 1].length()); + } + + structBatch->numElements = batchCount; + strBatch->numElements = batchCount; + + structBatch->hasNulls = true; + structBatch->notNull[0] = '\0'; + structBatch->notNull[batchCount - 1] = '\0'; + strBatch->hasNulls = true; + 
strBatch->notNull[0] = '\0'; + strBatch->notNull[batchCount - 1] = '\0'; + + for (uint64_t i = 0; i < batches; ++i) { + writer->add(*batch.get()); + } + writer->close(); + + std::unique_ptr reader = createReader(pool, memStream); + + // check column 1 (string) file stats + auto stats1 = reader->getColumnStatistics(1); + const GeospatialColumnStatistics* geoFileStats = + dynamic_cast(stats1.get()); + ENSURE_DYNAMIC_CAST_NOT_NULL(geoFileStats); + EXPECT_EQ(geoFileStats->getGeospatialTypes().size(), 0); + std::array expectValid = {false, false, false, false}; + EXPECT_EQ(geoFileStats->getBoundingBox().dimensionValid(), expectValid); + } +} // namespace orc \ No newline at end of file diff --git a/c++/test/TestStripeIndexStatistics.cc b/c++/test/TestStripeIndexStatistics.cc index 34a4649c35..a529792c17 100644 --- a/c++/test/TestStripeIndexStatistics.cc +++ b/c++/test/TestStripeIndexStatistics.cc @@ -46,18 +46,19 @@ namespace orc { intColStats = reinterpret_cast( stripeStats->getRowIndexStatistics(1, 0)); EXPECT_EQ( - "Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 1\nMaximum: 2000\nSum: 2001000\n", + "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 1\nMaximum: 2000\nSum: " + "2001000\n", intColStats->toString()); intColStats = reinterpret_cast( stripeStats->getRowIndexStatistics(1, 1)); EXPECT_EQ( - "Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 2001\nMaximum: 4000\nSum: " + "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 2001\nMaximum: 4000\nSum: " "6001000\n", intColStats->toString()); intColStats = reinterpret_cast( stripeStats->getRowIndexStatistics(1, 2)); EXPECT_EQ( - "Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 4001\nMaximum: 6000\nSum: " + "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 4001\nMaximum: 6000\nSum: " "10001000\n", intColStats->toString()); @@ -65,23 +66,48 @@ namespace orc { stringColStats = reinterpret_cast( stripeStats->getRowIndexStatistics(2, 0)); EXPECT_EQ( - "Data type: String\nValues: 2000\nHas null: no\nMinimum: 1000\nMaximum: 9a\nTotal length: " + "Data type: String\nValues: 2000\nHas null: yes\nMinimum: 1000\nMaximum: 9a\nTotal length: " "7892\n", stringColStats->toString()); stringColStats = reinterpret_cast( stripeStats->getRowIndexStatistics(2, 1)); EXPECT_EQ( - "Data type: String\nValues: 2000\nHas null: no\nMinimum: 2001\nMaximum: 4000\nTotal " + "Data type: String\nValues: 2000\nHas null: yes\nMinimum: 2001\nMaximum: 4000\nTotal " "length: " "8000\n", stringColStats->toString()); stringColStats = reinterpret_cast( stripeStats->getRowIndexStatistics(2, 2)); EXPECT_EQ( - "Data type: String\nValues: 2000\nHas null: no\nMinimum: 4001\nMaximum: 6000\nTotal " + "Data type: String\nValues: 2000\nHas null: yes\nMinimum: 4001\nMaximum: 6000\nTotal " "length: " "8000\n", stringColStats->toString()); + + std::unique_ptr stripeLevelStats = reader->getStripeStatistics(0, false); + const orc::IntegerColumnStatistics* stripeLevelIntColStats; + stripeLevelIntColStats = reinterpret_cast( + stripeLevelStats->getColumnStatistics(1)); + EXPECT_EQ( + "Data type: Integer\nValues: 6000\nHas null: yes\nMinimum: 1\nMaximum: 6000\nSum: " + "18003000\n", + stripeLevelIntColStats->toString()); + + const orc::StringColumnStatistics* stripeLevelStringColStats; + stripeLevelStringColStats = reinterpret_cast( + stripeLevelStats->getColumnStatistics(2)); + EXPECT_EQ( + "Data type: String\nValues: 6000\nHas null: yes\nMinimum: 1000\nMaximum: 9a\nTotal length: " + "23892\n", + stripeLevelStringColStats->toString()); + + 
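// Column statistics taken from the row-index-aware stripe statistics should match the
// stripe-level statistics fetched with getStripeStatistics(0, false); both are compared below.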
intColStats = + reinterpret_cast(stripeStats->getColumnStatistics(1)); + stringColStats = + reinterpret_cast(stripeStats->getColumnStatistics(2)); + + EXPECT_EQ(intColStats->toString(), stripeLevelIntColStats->toString()); + EXPECT_EQ(stringColStats->toString(), stripeLevelStringColStats->toString()); } } // namespace orc diff --git a/c++/test/TestTimestampStatistics.cc b/c++/test/TestTimestampStatistics.cc index d20a049557..e005fa6cf6 100644 --- a/c++/test/TestTimestampStatistics.cc +++ b/c++/test/TestTimestampStatistics.cc @@ -68,6 +68,19 @@ namespace orc { "00:00:00.688\nLowerBound: 1995-01-01 00:00:00.688\nMaximum: 2037-01-01 " "00:00:00.0\nUpperBound: 2037-01-01 00:00:00.1\n", stripeColStats->toString()); + + std::unique_ptr stripeStatsWithOutRowIndex = + reader->getStripeStatistics(0, false); + const orc::TimestampColumnStatistics* stripeColStatsOnly = + reinterpret_cast( + stripeStatsWithOutRowIndex->getColumnStatistics(0)); + + EXPECT_TRUE(stripeColStatsOnly->hasMinimum()); + EXPECT_TRUE(stripeColStatsOnly->hasMaximum()); + EXPECT_EQ(stripeColStats->toString(), stripeColStatsOnly->toString()); + EXPECT_EQ(stripeStats->getNumberOfColumns(), stripeStatsWithOutRowIndex->getNumberOfColumns()); + EXPECT_THROW(stripeStatsWithOutRowIndex->getRowIndexStatistics(1, 1), NotImplementedYet); + EXPECT_THROW(stripeStatsWithOutRowIndex->getNumberOfRowIndexStats(1), NotImplementedYet); } TEST(TestTimestampStatistics, testTimezoneUTC) { diff --git a/c++/test/TestTimezone.cc b/c++/test/TestTimezone.cc index 2330fcfb04..94895cd700 100644 --- a/c++/test/TestTimezone.cc +++ b/c++/test/TestTimezone.cc @@ -21,6 +21,7 @@ #include "wrap/gmock.h" #include "wrap/gtest-wrapper.h" +#include #include #include #include @@ -421,20 +422,61 @@ namespace orc { } TEST(TestTimezone, testMissingTZDB) { - const char* tzDirBackup = std::getenv("TZDIR"); - if (tzDirBackup != nullptr) { + const char* tzDir = std::getenv("TZDIR"); + std::string tzDirBackup; + if (tzDir != nullptr) { + // std::string creates a deepcopy of buffer, which avoids that + // unsetting environment variable wrecks pointer to tzDir + tzDirBackup = tzDir; ASSERT_TRUE(delEnv("TZDIR")); } ASSERT_TRUE(setEnv("TZDIR", "/path/to/wrong/tzdb")); - EXPECT_THAT([]() { getTimezoneByName("America/Los_Angeles"); }, + EXPECT_THAT([]() { getTimezoneByName("America/Los_Angeles").getVersion(); }, testing::ThrowsMessage(testing::HasSubstr( "Time zone file /path/to/wrong/tzdb/America/Los_Angeles does not exist." 
" Please install IANA time zone database and set TZDIR env."))); - if (tzDirBackup != nullptr) { - ASSERT_TRUE(setEnv("TZDIR", tzDirBackup)); + if (!tzDirBackup.empty()) { + ASSERT_TRUE(setEnv("TZDIR", tzDirBackup.c_str())); } else { ASSERT_TRUE(delEnv("TZDIR")); } } + TEST(TestTimezone, testTzdbFromCondaEnv) { + const char* tzDir = std::getenv("TZDIR"); + // test only makes sense if TZDIR exists + if (tzDir != nullptr) { + std::string tzDirBackup = tzDir; + ASSERT_TRUE(delEnv("TZDIR")); + + // remove "/share/zoneinfo" from TZDIR (as set through TZDATA_DIR in CI) to + // get the equivalent of CONDA_PREFIX, relative to the location of the tzdb + std::string condaPrefix(tzDirBackup); + condaPrefix += "/../.."; + ASSERT_TRUE(setEnv("CONDA_PREFIX", condaPrefix.c_str())); + + // small test sample to ensure tzbd loads with CONDA_PREFIX, even without TZDIR + const Timezone* zrh = &getTimezoneByName("Europe/Zurich"); + EXPECT_EQ("CET", getVariantFromZone(*zrh, "2024-03-31 00:59:59")); + EXPECT_EQ("CEST", getVariantFromZone(*zrh, "2024-03-31 01:00:00")); + EXPECT_EQ("CEST", getVariantFromZone(*zrh, "2024-10-27 00:59:59")); + EXPECT_EQ("CET", getVariantFromZone(*zrh, "2024-10-27 01:00:00")); + + // CONDA_PREFIX contains backslashes on windows; test that this doesn't blow up + std::replace(condaPrefix.begin(), condaPrefix.end(), '/', '\\'); + ASSERT_TRUE(setEnv("CONDA_PREFIX", condaPrefix.c_str())); + + // as above, but different timezone to avoid hitting cache + const Timezone* syd = &getTimezoneByName("Australia/Sydney"); + EXPECT_EQ("AEDT", getVariantFromZone(*syd, "2024-04-06 15:59:59")); + EXPECT_EQ("AEST", getVariantFromZone(*syd, "2024-04-06 16:00:00")); + EXPECT_EQ("AEST", getVariantFromZone(*syd, "2024-10-05 15:59:59")); + EXPECT_EQ("AEDT", getVariantFromZone(*syd, "2024-10-05 16:00:00")); + + // restore state of environment variables + ASSERT_TRUE(delEnv("CONDA_PREFIX")); + ASSERT_TRUE(setEnv("TZDIR", tzDirBackup.c_str())); + } + } + } // namespace orc diff --git a/c++/test/TestType.cc b/c++/test/TestType.cc index c9ac2f2850..cec0d8d2c4 100644 --- a/c++/test/TestType.cc +++ b/c++/test/TestType.cc @@ -325,7 +325,7 @@ namespace orc { expectLogicErrorDuringParse("int<>", "Invalid < after int type."); expectLogicErrorDuringParse("array(int)", "Missing < after array."); expectLogicErrorDuringParse("struct>", - "Invalid struct type. No field name set."); + "Invalid struct type. Field name can not contain '<'."); expectLogicErrorDuringParse("struct", "Missing comma after field."); } diff --git a/c++/test/TestUtil.cc b/c++/test/TestUtil.cc new file mode 100644 index 0000000000..a76880340c --- /dev/null +++ b/c++/test/TestUtil.cc @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "TestUtil.hh" +#include +#include + +namespace orc { + uint32_t GeometryTypeToWKB(geospatial::GeometryType geometryType, bool hasZ, bool hasM) { + auto wkbGeomType = static_cast(geometryType); + + if (hasZ) { + wkbGeomType += 1000; + } + + if (hasM) { + wkbGeomType += 2000; + } + + return wkbGeomType; + } + + std::string MakeWKBPoint(const std::vector& xyzm, bool hasZ, bool hasM) { + // 1:endianness + 4:type + 8:x + 8:y + int numBytes = kWkbPointXYSize + (hasZ ? sizeof(double) : 0) + (hasM ? sizeof(double) : 0); + std::string wkb(numBytes, 0); + char* ptr = wkb.data(); + + ptr[0] = kWkbNativeEndianness; + uint32_t geom_type = GeometryTypeToWKB(geospatial::GeometryType::POINT, hasZ, hasM); + std::memcpy(&ptr[1], &geom_type, 4); + std::memcpy(&ptr[5], &xyzm[0], 8); + std::memcpy(&ptr[13], &xyzm[1], 8); + ptr += 21; + + if (hasZ) { + std::memcpy(ptr, &xyzm[2], 8); + ptr += 8; + } + + if (hasM) { + std::memcpy(ptr, &xyzm[3], 8); + ptr += 8; + } + + assert(static_cast(ptr - wkb.data()) == wkb.length()); + return wkb; + } + +} // namespace orc \ No newline at end of file diff --git a/c++/test/TestUtil.hh b/c++/test/TestUtil.hh new file mode 100644 index 0000000000..104fbc0397 --- /dev/null +++ b/c++/test/TestUtil.hh @@ -0,0 +1,44 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include "orc/Geospatial.hh" + +#include +#include + +namespace orc { + + /// \brief Number of bytes in a WKB Point with X and Y dimensions (uint8_t endian, + /// uint32_t geometry type, 2 * double coordinates) + static constexpr int kWkbPointXYSize = 21; + + static bool isLittleEndian() { + static union { + uint32_t i; + char c[4]; + } num = {0x01020304}; + return num.c[0] == 4; + } + + static uint8_t kWkbNativeEndianness = isLittleEndian() ? 0x01 : 0x00; + + uint32_t GeometryTypeToWKB(geospatial::GeometryType geometryType, bool hasZ, bool hasM); + std::string MakeWKBPoint(const std::vector& xyzm, bool hasZ, bool hasM); + +} // namespace orc \ No newline at end of file diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc index d160f82ff1..11ba0c9dea 100644 --- a/c++/test/TestWriter.cc +++ b/c++/test/TestWriter.cc @@ -16,18 +16,20 @@ * limitations under the License. 
*/ -#include "orc/ColumnPrinter.hh" +#include #include "orc/OrcFile.hh" #include "MemoryInputStream.hh" #include "MemoryOutputStream.hh" #include "Reader.hh" +#include "TestUtil.hh" #include "wrap/gmock.h" #include "wrap/gtest-wrapper.h" #include #include +#include #include #ifdef __clang__ @@ -41,11 +43,11 @@ namespace orc { const int DEFAULT_MEM_STREAM_SIZE = 100 * 1024 * 1024; // 100M - std::unique_ptr createWriter(uint64_t stripeSize, uint64_t compresionblockSize, - CompressionKind compression, const Type& type, - MemoryPool* memoryPool, OutputStream* stream, - FileVersion version, uint64_t stride = 0, - const std::string& timezone = "GMT", + std::unique_ptr createWriter(uint64_t stripeSize, uint64_t memoryBlockSize, + uint64_t compresionblockSize, CompressionKind compression, + const Type& type, MemoryPool* memoryPool, + OutputStream* stream, FileVersion version, + uint64_t stride = 0, const std::string& timezone = "GMT", bool useTightNumericVector = false) { WriterOptions options; options.setStripeSize(stripeSize); @@ -56,6 +58,9 @@ namespace orc { options.setFileVersion(version); options.setTimezoneName(timezone); options.setUseTightNumericVector(useTightNumericVector); + options.setMemoryBlockSize(memoryBlockSize); + // enable align block bound to row group when stride is not 0 + options.setAlignBlockBoundToRowGroup(true); return createWriter(type, stream, options); } @@ -83,7 +88,56 @@ namespace orc { return reader->createRowReader(rowReaderOpts); } - class WriterTest : public TestWithParam { + void verifyCompressionBlockAlignment(std::unique_ptr& reader, uint64_t columnCount) { + auto stripeCount = reader->getNumberOfStripes(); + for (uint64_t stripeIndex = 0; stripeIndex < stripeCount; ++stripeIndex) { + for (uint64_t i = 0; i < columnCount; ++i) { + auto rowGroupIndexMap = reader->getRowGroupIndex(stripeIndex); + EXPECT_TRUE(rowGroupIndexMap.size() > 0); + auto rowGroupIndex = rowGroupIndexMap[columnCount]; + auto subType = reader->getType().getSubtype(i); + EXPECT_TRUE(rowGroupIndex.positions.size() > 0); + for (auto rowGroupPositions : rowGroupIndex.positions) { + for (uint64_t posIndex = 0; posIndex < rowGroupPositions.size(); ++posIndex) { + // After we call finishStream(), unusedBufferSize is set to 0, + // so only the first position is valid in each recordPosition call. + switch (subType->getKind()) { + case DECIMAL: + case STRING: + case BINARY: + case CHAR: + case VARCHAR: { + if (posIndex != 0 && posIndex != 2) { + EXPECT_EQ(rowGroupPositions[posIndex], 0); + } + break; + } + case TIMESTAMP_INSTANT: + case TIMESTAMP: { + if (posIndex != 0 && posIndex != 3) { + EXPECT_EQ(rowGroupPositions[posIndex], 0); + } + break; + } + default: { + if (posIndex != 0) { + EXPECT_EQ(rowGroupPositions[posIndex], 0); + } + break; + } + } + } + } + } + } + } + + struct TestParams { + FileVersion fileVersion; + bool enableAlignBlockBoundToRowGroup; + }; + + class WriterTest : public TestWithParam { // You can implement all the usual fixture class members here. // To access the test parameter, call GetParam() from class // TestWithParam. 
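The WriterTest fixture now receives a TestParams value (file version plus the block-alignment flag) instead of a bare FileVersion, so each writer test can run both with and without compression blocks aligned to row-group boundaries. As a rough, illustrative sketch only — the instantiation actually used by this patch may differ, and the suite name is assumed — such a fixture would typically be instantiated like this:

    // Illustrative sketch, not part of the patch: one possible instantiation of the
    // parameterized fixture using the TestParams struct introduced above.
    INSTANTIATE_TEST_SUITE_P(OrcTest, WriterTest,
                             ::testing::Values(TestParams{FileVersion::v_0_11(), false},
                                               TestParams{FileVersion::v_0_11(), true}));

Each TEST_P body then picks up GetParam().fileVersion and GetParam().enableAlignBlockBoundToRowGroup in SetUp(), as shown in the hunk that follows.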
@@ -91,13 +145,15 @@ namespace orc { protected: FileVersion fileVersion; + bool enableAlignBlockBoundToRowGroup; public: - WriterTest() : fileVersion(FileVersion::v_0_11()) {} + WriterTest() : fileVersion(FileVersion::v_0_11()), enableAlignBlockBoundToRowGroup(false) {} }; void WriterTest::SetUp() { - fileVersion = GetParam(); + fileVersion = GetParam().fileVersion; + enableAlignBlockBoundToRowGroup = GetParam().enableAlignBlockBoundToRowGroup; } TEST_P(WriterTest, writeEmptyFile) { @@ -107,10 +163,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; // 16K uint64_t compressionBlockSize = 1024; // 1k + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion); writer->close(); auto inStream = std::make_unique(memStream.getData(), memStream.getLength()); @@ -135,10 +192,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; // 16K uint64_t compressionBlockSize = 1024; // 1k + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion); std::unique_ptr batch = writer->createRowBatch(1024); StructVectorBatch* structBatch = dynamic_cast(batch.get()); LongVectorBatch* longBatch = dynamic_cast(structBatch->fields[0]); @@ -195,10 +253,11 @@ namespace orc { uint64_t stripeSize = 1024; // 1K uint64_t compressionBlockSize = 1024; // 1k + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion); std::unique_ptr batch = writer->createRowBatch(65535); StructVectorBatch* structBatch = dynamic_cast(batch.get()); LongVectorBatch* longBatch = dynamic_cast(structBatch->fields[0]); @@ -241,13 +300,14 @@ namespace orc { uint64_t stripeSize = 1024; // 1K uint64_t compressionBlockSize = 1024; // 1k + uint64_t memoryBlockSize = 64; char dataBuffer[327675]; uint64_t offset = 0; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 
1024 : 0); std::unique_ptr batch = writer->createRowBatch(65535); StructVectorBatch* structBatch = dynamic_cast(batch.get()); StringVectorBatch* strBatch = dynamic_cast(structBatch->fields[0]); @@ -289,6 +349,9 @@ namespace orc { EXPECT_EQ(i, static_cast(atoi(str.c_str()))); EXPECT_EQ(i, static_cast(atoi(bin.c_str()))); } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } EXPECT_FALSE(rowReader->next(*batch)); } @@ -301,6 +364,7 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 655350; + uint64_t memoryBlockSize = 64; std::vector data(rowCount); for (uint64_t i = 0; i < rowCount; ++i) { @@ -308,8 +372,8 @@ namespace orc { } std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); DoubleVectorBatch* doubleBatch = dynamic_cast(structBatch->fields[0]); @@ -345,6 +409,10 @@ namespace orc { 0.000001f); } EXPECT_FALSE(rowReader->next(*batch)); + + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeShortIntLong) { @@ -356,10 +424,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 65535; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); LongVectorBatch* smallIntBatch = dynamic_cast(structBatch->fields[0]); @@ -396,6 +465,9 @@ namespace orc { EXPECT_EQ(static_cast(i), intBatch->data[i]); EXPECT_EQ(static_cast(i), bigIntBatch->data[i]); } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeTinyint) { @@ -406,16 +478,20 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 65535; + uint64_t memoryBlockSize = 64; - std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + std::unique_ptr writer = createWriter( + stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZSTD, *type, pool, + &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 
1024 : 0, "GMT", true); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); - LongVectorBatch* byteBatch = dynamic_cast(structBatch->fields[0]); + ByteVectorBatch* byteBatch = dynamic_cast(structBatch->fields[0]); + int64_t sum = 0; for (uint64_t i = 0; i < rowCount; ++i) { - byteBatch->data[i] = static_cast(i); + int8_t x = static_cast(i); + byteBatch->data[i] = x; + sum += x; } structBatch->numElements = rowCount; byteBatch->numElements = rowCount; @@ -429,13 +505,29 @@ namespace orc { EXPECT_EQ(rowCount, reader->getNumberOfRows()); batch = rowReader->createRowBatch(rowCount); + rowReader->seekToRow(20); EXPECT_EQ(true, rowReader->next(*batch)); + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } structBatch = dynamic_cast(batch.get()); - byteBatch = dynamic_cast(structBatch->fields[0]); - for (uint64_t i = 0; i < rowCount; ++i) { - EXPECT_EQ(static_cast(i), static_cast(byteBatch->data[i])); - } + auto outByteBatch = dynamic_cast(structBatch->fields[0]); + for (uint64_t i = 0; i < rowCount - 20; ++i) { + EXPECT_EQ(static_cast(i + 20), static_cast(outByteBatch->data[i])); + } + + auto col_stats = reader->getColumnStatistics(1); + ASSERT_NE(col_stats, nullptr); + EXPECT_EQ(col_stats->getNumberOfValues(), rowCount); + EXPECT_FALSE(col_stats->hasNull()); + auto int_stats = dynamic_cast(col_stats.get()); + ASSERT_NE(int_stats, nullptr); + EXPECT_TRUE(int_stats->hasMinimum() && int_stats->hasMaximum()); + EXPECT_EQ(int_stats->getMinimum(), -128); + EXPECT_EQ(int_stats->getMaximum(), 127); + EXPECT_TRUE(int_stats->hasSum()); + EXPECT_EQ(int_stats->getSum(), sum); } TEST_P(WriterTest, writeBooleanColumn) { @@ -446,10 +538,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 65535; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); LongVectorBatch* byteBatch = dynamic_cast(structBatch->fields[0]); @@ -476,6 +569,9 @@ namespace orc { for (uint64_t i = 0; i < rowCount; ++i) { EXPECT_EQ((i % 3) == 0 ? 1 : 0, byteBatch->data[i]); } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeDate) { @@ -486,10 +582,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 1024; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 
1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); @@ -517,6 +614,9 @@ namespace orc { for (uint64_t i = 0; i < rowCount; ++i) { EXPECT_EQ(static_cast(i), longBatch->data[i]); } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeTimestamp) { @@ -527,10 +627,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 102400; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); TimestampVectorBatch* tsBatch = dynamic_cast(structBatch->fields[0]); @@ -562,14 +663,18 @@ namespace orc { EXPECT_EQ(times[i], tsBatch->data[i]); EXPECT_EQ(i * 1000, tsBatch->nanoseconds[i]); } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeNegativeTimestamp) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); std::unique_ptr type(Type::buildTypeFromString("struct")); - auto writer = createWriter(16 * 1024 * 1024, 64 * 1024, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + auto writer = + createWriter(16 * 1024 * 1024, 64 * 1024, 256 * 1024, CompressionKind_ZLIB, *type, pool, + &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); uint64_t batchCount = 5; auto batch = writer->createRowBatch(batchCount * 2); auto structBatch = dynamic_cast(batch.get()); @@ -619,6 +724,10 @@ namespace orc { } EXPECT_EQ(1000000, tsBatch->nanoseconds[i]); } + + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } // TODO: Disable the test below for Windows for following reasons: @@ -638,10 +747,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 1; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion, 0, writerTimezone); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, 0, writerTimezone); auto batch = writer->createRowBatch(rowCount); auto& structBatch = dynamic_cast(*batch); auto& tsBatch = dynamic_cast(*structBatch.fields[0]); @@ -734,10 +844,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 102400; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 
1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); TimestampVectorBatch* tsBatch = dynamic_cast(structBatch->fields[0]); @@ -769,6 +880,9 @@ namespace orc { EXPECT_EQ(times[i], tsBatch->data[i]); EXPECT_EQ(i * 1000, tsBatch->nanoseconds[i]); } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeCharAndVarcharColumn) { @@ -779,13 +893,14 @@ namespace orc { uint64_t stripeSize = 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 65535; + uint64_t memoryBlockSize = 64; char dataBuffer[327675]; uint64_t offset = 0; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); @@ -847,6 +962,9 @@ namespace orc { } EXPECT_FALSE(rowReader->next(*batch)); + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeDecimal64Column) { @@ -858,10 +976,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; // 16K uint64_t compressionBlockSize = 1024; // 1k uint64_t rowCount = 1024; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); Decimal64VectorBatch* decBatch = dynamic_cast(structBatch->fields[0]); @@ -923,6 +1042,9 @@ namespace orc { EXPECT_EQ(dec, decBatch->values[i]); EXPECT_EQ(-dec, decBatch->values[i + maxPrecision]); } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeDecimal128Column) { @@ -934,10 +1056,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 1024; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 
1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); Decimal128VectorBatch* decBatch = dynamic_cast(structBatch->fields[0]); @@ -1009,6 +1132,9 @@ namespace orc { EXPECT_EQ(expected, decBatch->values[i].toString()); EXPECT_EQ("-" + expected, decBatch->values[i + maxPrecision].toString()); } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeListColumn) { @@ -1022,10 +1148,11 @@ namespace orc { uint64_t rowCount = 1024; uint64_t maxListLength = 10; uint64_t offset = 0; + uint64_t memoryBlockSize = 8 * 1024; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount * maxListLength); StructVectorBatch* structBatch = dynamic_cast(batch.get()); @@ -1071,6 +1198,9 @@ namespace orc { EXPECT_EQ(static_cast(i), data[offsets[i] + j]); } } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeMapColumn) { @@ -1081,10 +1211,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 1024, maxListLength = 10, offset = 0; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount * maxListLength); StructVectorBatch* structBatch = dynamic_cast(batch.get()); MapVectorBatch* mapBatch = dynamic_cast(structBatch->fields[0]); @@ -1151,6 +1282,9 @@ namespace orc { EXPECT_EQ(static_cast(i), elemData[offsets[i] + j]); } } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeUnionColumn) { @@ -1162,10 +1296,11 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 3333; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 
1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); UnionVectorBatch* unionBatch = dynamic_cast(structBatch->fields[0]); @@ -1247,6 +1382,9 @@ namespace orc { break; } } + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, writeUTF8CharAndVarcharColumn) { @@ -1257,9 +1395,10 @@ namespace orc { uint64_t stripeSize = 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 3; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); std::unique_ptr batch = writer->createRowBatch(rowCount); StructVectorBatch* structBatch = dynamic_cast(batch.get()); StringVectorBatch* charBatch = dynamic_cast(structBatch->fields[0]); @@ -1317,6 +1456,9 @@ namespace orc { EXPECT_TRUE(memcmp(varcharBatch->data[2], expectedTwoChars, 4) == 0); EXPECT_FALSE(rowReader->next(*batch)); + if (enableAlignBlockBoundToRowGroup) { + verifyCompressionBlockAlignment(reader, type->getSubtypeCount()); + } } TEST_P(WriterTest, testWriteListColumnWithNull) { @@ -1326,10 +1468,11 @@ namespace orc { uint64_t stripeSize = 1024; uint64_t compressionBlockSize = 1024; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion); std::unique_ptr batch = writer->createRowBatch(4); StructVectorBatch* structBatch = dynamic_cast(batch.get()); @@ -1407,10 +1550,11 @@ namespace orc { uint64_t stripeSize = 1024; uint64_t compressionBlockSize = 1024; + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion); // test data looks like below - // {0} @@ -1485,12 +1629,13 @@ namespace orc { uint64_t stripeSize = 1024; uint64_t compressionBlockSize = 1024; + uint64_t memoryBlockSize = 64; // 10000 rows with every 1000 row as an RG // Each RG has 100 null rows except that the 5th RG is all null std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion, 1000); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, 1000); std::unique_ptr batch = writer->createRowBatch(10000); StructVectorBatch* structBatch = dynamic_cast(batch.get()); @@ -1622,12 +1767,13 @@ namespace orc { TEST_P(WriterTest, testBloomFilter) { WriterOptions options; options.setStripeSize(1024) - .setCompressionBlockSize(64) + .setCompressionBlockSize(1024) .setCompression(CompressionKind_ZSTD) .setMemoryPool(getDefaultPool()) .setRowIndexStride(10000) .setFileVersion(fileVersion) - .setColumnsUseBloomFilter({1, 2, 3}); + .setColumnsUseBloomFilter({1, 2, 3}) + .setMemoryBlockSize(64); // write 65535 rows of data MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); @@ -1716,7 +1862,7 @@ namespace orc { auto type = 
std::unique_ptr(Type::buildTypeFromString("struct")); WriterOptions options; options.setStripeSize(1024 * 1024) - .setCompressionBlockSize(1024) + .setMemoryBlockSize(1024) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(1000); @@ -1809,8 +1955,11 @@ namespace orc { uint64_t rowCount = 5000000; auto type = std::unique_ptr(Type::buildTypeFromString("struct")); WriterOptions options; - options.setStripeSize(1024).setCompressionBlockSize(1024).setCompression(kind).setMemoryPool( - pool); + options.setStripeSize(1024) + .setCompressionBlockSize(1024) + .setMemoryBlockSize(64) + .setCompression(kind) + .setMemoryPool(pool); auto writer = createWriter(*type, &memStream, options); auto batch = writer->createRowBatch(rowCount); @@ -1853,10 +2002,11 @@ namespace orc { WriterOptions options; options.setStripeSize(1024 * 1024) .setCompressionBlockSize(64 * 1024) + .setMemoryBlockSize(1024) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(1000) - .setOutputBufferCapacity(capacity); + .setCompressionBlockSize(capacity); auto writer = createWriter(*type, &memStream, options); auto batch = writer->createRowBatch(rowCount); @@ -1913,6 +2063,7 @@ namespace orc { uint64_t stripeSize = 16 * 1024; uint64_t compressionBlockSize = 1024; uint64_t rowCount = 65530; + uint64_t memoryBlockSize = 64; std::vector data(rowCount); for (uint64_t i = 0; i < rowCount; ++i) { @@ -1920,8 +2071,8 @@ namespace orc { } std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion, 0, "GMT", true); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, 0, "GMT", true); // start from here/ std::unique_ptr batch = writer->createRowBatch(rowCount / 2); StructVectorBatch* structBatch = dynamic_cast(batch.get()); @@ -2010,10 +2161,11 @@ namespace orc { uint64_t stripeSize = 1024; // 1K uint64_t compressionBlockSize = 1024; // 1k + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion); std::unique_ptr batch = writer->createRowBatch(65535); StructVectorBatch* structBatch = dynamic_cast(batch.get()); LongVectorBatch* longBatch = dynamic_cast(structBatch->fields[0]); @@ -2065,10 +2217,11 @@ namespace orc { uint64_t stripeSize = 1024; // 1K uint64_t compressionBlockSize = 1024; // 1k + uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, - &memStream, fileVersion); + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion); std::unique_ptr batch = writer->createRowBatch(65535); StructVectorBatch* structBatch = dynamic_cast(batch.get()); LongVectorBatch* longBatch = dynamic_cast(structBatch->fields[0]); @@ -2131,6 +2284,7 @@ namespace orc { WriterOptions options; options.setStripeSize(16 * 1024) .setCompressionBlockSize(1024) + .setMemoryBlockSize(64) .setCompression(CompressionKind_NONE) .setMemoryPool(pool) .setRowIndexStride(1000); @@ -2201,7 +2355,192 @@ namespace orc { std::invalid_argument); } - INSTANTIATE_TEST_SUITE_P(OrcTest, WriterTest, - Values(FileVersion::v_0_11(), FileVersion::v_0_12(), - FileVersion::UNSTABLE_PRE_2_0())); + 
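The single-list instantiation removed here is superseded by the `testParams` vector and the `::testing::ValuesIn(testParams)` instantiation added near the end of this file, so each `WriterTest` case now receives a `FileVersion` together with an `enableAlignBlockBoundToRowGroup` flag (the latter drives the `? 1024 : 0` row-index stride and the `verifyCompressionBlockAlignment` checks above). A minimal sketch of how such a parameter bundle and fixture can be wired up is shown below; the type and member names are assumptions for illustration only and are not taken from this patch.

```cpp
#include <gtest/gtest.h>
#include <vector>
#include "orc/Common.hh"  // orc::FileVersion (assumed header; adjust to the test's actual includes)

namespace orc {

  // Hypothetical parameter bundle; the real patch declares an equivalent type
  // outside the hunks shown here.
  struct WriterTestParams {
    FileVersion fileVersion;
    bool enableAlignBlockBoundToRowGroup;
  };

  class WriterTest : public ::testing::TestWithParam<WriterTestParams> {
   protected:
    void SetUp() override {
      // Each TEST_P body can then read these two members directly,
      // mirroring how the tests above use them.
      fileVersion = GetParam().fileVersion;
      enableAlignBlockBoundToRowGroup = GetParam().enableAlignBlockBoundToRowGroup;
    }
    FileVersion fileVersion = FileVersion::v_0_12();
    bool enableAlignBlockBoundToRowGroup = false;
  };

  // Mirrors the ValuesIn(testParams) instantiation added at the end of the file.
  static const std::vector<WriterTestParams> exampleParams = {
      {FileVersion::v_0_11(), true},
      {FileVersion::v_0_12(), false},
  };

  INSTANTIATE_TEST_SUITE_P(OrcTestSketch, WriterTest, ::testing::ValuesIn(exampleParams));

}  // namespace orc
```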
TEST_P(WriterTest, testLazyLoadTZDB) { + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + MemoryPool* pool = getDefaultPool(); + std::unique_ptr type(Type::buildTypeFromString("struct")); + + uint64_t stripeSize = 1024; // 1K + uint64_t compressionBlockSize = 1024; // 1k + uint64_t memoryBlockSize = 64; + + std::unique_ptr writer = + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, 0, "/ERROR/TIMEZONE"); + std::unique_ptr batch = writer->createRowBatch(10); + StructVectorBatch* structBatch = dynamic_cast(batch.get()); + LongVectorBatch* longBatch = dynamic_cast(structBatch->fields[0]); + + for (uint64_t j = 0; j < 10; ++j) { + for (uint64_t i = 0; i < 10; ++i) { + longBatch->data[i] = static_cast(i); + } + structBatch->numElements = 10; + longBatch->numElements = 10; + + writer->add(*batch); + } + + writer->close(); + + auto inStream = std::make_unique(memStream.getData(), memStream.getLength()); + std::unique_ptr reader = createReader(pool, std::move(inStream)); + std::unique_ptr rowReader = createRowReader(reader.get(), "/ERROR/TIMEZONE"); + EXPECT_EQ(100, reader->getNumberOfRows()); + + batch = rowReader->createRowBatch(10); + for (uint64_t j = 0; j < 10; ++j) { + EXPECT_TRUE(rowReader->next(*batch)); + EXPECT_EQ(10, batch->numElements); + + for (uint64_t i = 0; i < 10; ++i) { + structBatch = dynamic_cast(batch.get()); + longBatch = dynamic_cast(structBatch->fields[0]); + EXPECT_EQ(i, longBatch->data[i]); + } + } + EXPECT_FALSE(rowReader->next(*batch)); + } + + TEST_P(WriterTest, writeGeometryAndGeographyColumn) { + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + MemoryPool* pool = getDefaultPool(); + std::unique_ptr type(Type::buildTypeFromString( + "struct")); + uint64_t stripeSize = 1024; // 1K + uint64_t compressionBlockSize = 1024; // 1k + uint64_t memoryBlockSize = 64; + std::unique_ptr writer = + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 1024 : 0); + + EXPECT_EQ("struct", + type->toString()); + + uint64_t batchCount = 100, batchSize = 1000; + std::unique_ptr batch = writer->createRowBatch(batchSize); + StructVectorBatch* structBatch = dynamic_cast(batch.get()); + StringVectorBatch* geometryBatch = dynamic_cast(structBatch->fields[0]); + StringVectorBatch* geographyBatch = dynamic_cast(structBatch->fields[1]); + + std::unique_ptr buffer(new char[8000000]); + char* buf = buffer.get(); + + // write 100 * 1000 rows, every 100 rows are in one row group + // every 2 consecutive rows has one null value. 
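The new geometry/geography test fills every non-null row with a WKB-encoded point produced by the test helper `MakeWKBPoint({x, y}, false, false)`. As a reference for the byte layout being round-tripped, the sketch below builds the standard 21-byte WKB encoding of a 2-D point (a 1-byte byte-order marker, a 4-byte geometry type of 1, then the X and Y doubles). It is an illustration of the format only, not the helper used in the test, and it assumes a little-endian host.

```cpp
#include <cstdint>
#include <cstring>
#include <string>

// Illustrative only: standard WKB for a 2-D point is
//   1 byte  byte-order marker (1 = little-endian)
//   4 bytes geometry type     (1 = Point)
//   8 bytes X coordinate, 8 bytes Y coordinate
std::string makeLittleEndianWkbPoint(double x, double y) {
  std::string wkb(21, '\0');
  wkb[0] = 0x01;  // little-endian marker; copies below assume a little-endian host
  uint32_t type = 1;  // Point
  std::memcpy(&wkb[1], &type, sizeof(type));
  std::memcpy(&wkb[5], &x, sizeof(x));
  std::memcpy(&wkb[13], &y, sizeof(y));
  return wkb;  // 21 bytes total
}
```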
+ uint64_t rowCount = 0; + for (uint64_t i = 0; i != batchCount; ++i) { + structBatch->hasNulls = false; + structBatch->numElements = batchSize; + + geometryBatch->hasNulls = true; + geometryBatch->numElements = batchSize; + geographyBatch->hasNulls = true; + geographyBatch->numElements = batchSize; + + for (uint64_t j = 0; j != batchSize; ++j) { + if (rowCount % 2 == 0) { + geometryBatch->notNull[j] = 0; + geographyBatch->notNull[j] = 0; + } else { + geometryBatch->notNull[j] = 1; + geographyBatch->notNull[j] = 1; + + std::string wkb = MakeWKBPoint({j * 1.0, j * 1.0}, false, false); + strncpy(buf, wkb.c_str(), wkb.size()); + + geometryBatch->data[j] = buf; + geometryBatch->length[j] = static_cast(wkb.size()); + geographyBatch->data[j] = buf; + geographyBatch->length[j] = static_cast(wkb.size()); + + buf += wkb.size(); + } + ++rowCount; + } + + writer->add(*batch); + } + writer->close(); + + std::unique_ptr inStream( + new MemoryInputStream(memStream.getData(), memStream.getLength())); + std::unique_ptr reader = createReader(pool, std::move(inStream)); + EXPECT_EQ(batchCount * batchSize, reader->getNumberOfRows()); + EXPECT_TRUE(reader->getNumberOfStripes() > 1); + + EXPECT_EQ("struct", + reader->getType().toString()); + // test sequential reader + std::unique_ptr seqReader = createRowReader(reader.get()); + rowCount = 0; + for (uint64_t i = 0; i != batchCount; ++i) { + seqReader->next(*batch); + + EXPECT_FALSE(structBatch->hasNulls); + EXPECT_EQ(batchSize, structBatch->numElements); + + EXPECT_TRUE(geometryBatch->hasNulls); + EXPECT_EQ(batchSize, geometryBatch->numElements); + EXPECT_TRUE(geographyBatch->hasNulls); + EXPECT_EQ(batchSize, geographyBatch->numElements); + + for (uint64_t j = 0; j != batchSize; ++j) { + if (rowCount % 2 == 0) { + EXPECT_TRUE(geometryBatch->notNull[j] == 0); + EXPECT_TRUE(geographyBatch->notNull[j] == 0); + } else { + EXPECT_TRUE(geometryBatch->notNull[j] != 0); + EXPECT_TRUE(geographyBatch->notNull[j] != 0); + std::string wkb = MakeWKBPoint({j * 1.0, j * 1.0}, false, false); + EXPECT_EQ(static_cast(wkb.size()), geometryBatch->length[j]); + EXPECT_TRUE(strncmp(geometryBatch->data[j], wkb.c_str(), wkb.size()) == 0); + EXPECT_EQ(static_cast(wkb.size()), geographyBatch->length[j]); + EXPECT_TRUE(strncmp(geographyBatch->data[j], wkb.c_str(), wkb.size()) == 0); + } + ++rowCount; + } + } + EXPECT_FALSE(seqReader->next(*batch)); + + // test seek reader + std::unique_ptr seekReader = createRowReader(reader.get()); + batch = seekReader->createRowBatch(2); + structBatch = dynamic_cast(batch.get()); + geometryBatch = dynamic_cast(structBatch->fields[0]); + geographyBatch = dynamic_cast(structBatch->fields[1]); + + for (uint64_t row = rowCount - 2; row >= 100; row -= 100) { + seekReader->seekToRow(row); + seekReader->next(*batch); + + EXPECT_FALSE(structBatch->hasNulls); + EXPECT_EQ(2, structBatch->numElements); + EXPECT_TRUE(geometryBatch->hasNulls); + EXPECT_EQ(2, geometryBatch->numElements); + EXPECT_TRUE(geographyBatch->hasNulls); + EXPECT_EQ(2, geographyBatch->numElements); + + EXPECT_TRUE(geometryBatch->notNull[0] == 0); + EXPECT_TRUE(geometryBatch->notNull[1] != 0); + EXPECT_TRUE(geographyBatch->notNull[0] == 0); + EXPECT_TRUE(geographyBatch->notNull[1] != 0); + + std::string wkb = MakeWKBPoint({(row + 1) * 1.0, (row + 1) * 1.0}, false, false); + + EXPECT_EQ(static_cast(wkb.size()), geometryBatch->length[1]); + EXPECT_TRUE(strncmp(geometryBatch->data[1], wkb.c_str(), wkb.size()) == 0); + EXPECT_EQ(static_cast(wkb.size()), geographyBatch->length[1]); + 
EXPECT_TRUE(strncmp(geographyBatch->data[1], wkb.c_str(), wkb.size()) == 0); + } + } + + std::vector testParams = {{FileVersion::v_0_11(), true}, + {FileVersion::v_0_11(), false}, + {FileVersion::v_0_12(), false}, + {FileVersion::v_0_12(), true}, + {FileVersion::UNSTABLE_PRE_2_0(), false}, + {FileVersion::UNSTABLE_PRE_2_0(), true}}; + + INSTANTIATE_TEST_SUITE_P(OrcTest, WriterTest, ::testing::ValuesIn(testParams)); } // namespace orc diff --git a/c++/test/meson.build b/c++/test/meson.build new file mode 100644 index 0000000000..75dcbb0940 --- /dev/null +++ b/c++/test/meson.build @@ -0,0 +1,88 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_incdir = include_directories( + '../include', + '../src', +) + +test_sources = [ + 'MemoryInputStream.cc', + 'MemoryOutputStream.cc', + 'MockStripeStreams.cc', + 'TestAttributes.cc', + 'TestBlockBuffer.cc', + 'TestBufferedOutputStream.cc', + 'TestBloomFilter.cc', + 'TestByteRle.cc', + 'TestByteRLEEncoder.cc', + 'TestColumnPrinter.cc', + 'TestColumnReader.cc', + 'TestColumnStatistics.cc', + 'TestCompression.cc', + 'TestConvertColumnReader.cc', + 'TestDecompression.cc', + 'TestDecimal.cc', + 'TestDictionaryEncoding.cc', + 'TestDriver.cc', + 'TestInt128.cc', + 'TestMurmur3.cc', + 'TestPredicateLeaf.cc', + 'TestPredicatePushdown.cc', + 'TestReader.cc', + 'TestRleDecoder.cc', + 'TestRleEncoder.cc', + 'TestRLEV2Util.cc', + 'TestSargsApplier.cc', + 'TestSearchArgument.cc', + 'TestSchemaEvolution.cc', + 'TestStatistics.cc', + 'TestStripeIndexStatistics.cc', + 'TestTimestampStatistics.cc', + 'TestTimezone.cc', + 'TestType.cc', + 'TestUtil.cc', + 'TestWriter.cc', + 'TestCache.cc', +] + +orc_test = executable( + 'orc-test', + sources: test_sources, + include_directories: test_incdir, + dependencies: [ + orc_dep, + lz4_dep, + protobuf_dep, + snappy_dep, + zlib_dep, + gtest_dep, + gmock_dep, + sparsehash_c11_dep, + ], +) + +exc = executable( + 'create-test-files', + sources: ['CreateTestFiles.cc'], + include_directories: test_incdir, + dependencies: [ + orc_dep, + protobuf_dep, + ], +) +test('orc-test', exc) diff --git a/cmake_modules/CheckFormat.cmake b/cmake_modules/CheckFormat.cmake new file mode 100644 index 0000000000..17017da133 --- /dev/null +++ b/cmake_modules/CheckFormat.cmake @@ -0,0 +1,111 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Needed for linting targets, etc. +# Use the first Python installation on PATH, not the newest one +set(Python3_FIND_STRATEGY "LOCATION") +# On Windows, use registry last, not first +set(Python3_FIND_REGISTRY "LAST") +# On macOS, use framework last, not first +set(Python3_FIND_FRAMEWORK "LAST") + +find_package(Python3) +set(PYTHON_EXECUTABLE ${Python3_EXECUTABLE}) + +set(BUILD_SUPPORT_DIR "${PROJECT_SOURCE_DIR}/c++/build-support") + +find_program(CLANG_FORMAT_BIN + NAMES clang-format-13 + HINTS ${CLANG_SEARCH_PATH}) + +find_program(CLANG_TIDY_BIN + NAMES clang-tidy-13 + HINTS ${CLANG_SEARCH_PATH}) + +find_program(CLANG_APPLY_REPLACEMENTS_BIN + NAMES clang-apply-replacements-13 + HINTS ${CLANG_SEARCH_PATH}) + + +if("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND") + message(WARNING "Couldn't find clang-format.") +else() + message(STATUS "Found clang-format at ${CLANG_FORMAT_BIN}") +endif() + +if("${CLANG_TIDY_BIN}" STREQUAL "CLANG_TIDY_BIN-NOTFOUND") + message(WARNING "Couldn't find clang-tidy.") +else() + # Output compile_commands.json + set(CMAKE_EXPORT_COMPILE_COMMANDS 1) + message(STATUS "Found clang-tidy at ${CLANG_TIDY_BIN}") +endif() + +if("${CLANG_APPLY_REPLACEMENTS_BIN}" STREQUAL "CLANG_APPLY_REPLACEMENTS_BIN-NOTFOUND") + message(WARNING "Couldn't find clang-apply-replacements.") +else() + # Output compile_commands.json + set(CMAKE_EXPORT_COMPILE_COMMANDS 1) + message(STATUS "Found clang-apply-replacements at ${CLANG_APPLY_REPLACEMENTS_BIN}") +endif() + +if(NOT LINT_EXCLUSIONS_FILE) + # source files matching a glob from a line in this file + # will be excluded from linting (cpplint, clang-tidy, clang-format) + set(LINT_EXCLUSIONS_FILE ${BUILD_SUPPORT_DIR}/lint_exclusions.txt) +endif() + +# runs clang-tidy and exits with a non-zero exit code if any errors are found. 
+# note that clang-tidy automatically looks for a .clang-tidy file in parent directories +add_custom_target(check-clang-tidy + ${PYTHON_EXECUTABLE} + ${BUILD_SUPPORT_DIR}/run_clang_tidy.py # run LLVM's clang-tidy script + -clang-tidy-binary ${CLANG_TIDY_BIN} # using our clang-tidy binary + -p ${PROJECT_BINARY_DIR} # using cmake's generated compile commands +) + +add_custom_target(fix-clang-tidy + ${PYTHON_EXECUTABLE} + ${BUILD_SUPPORT_DIR}/run_clang_tidy.py # run LLVM's clang-tidy script + -clang-tidy-binary ${CLANG_TIDY_BIN} # using our clang-tidy binary + -p ${PROJECT_BINARY_DIR} # using cmake's generated compile commands + -clang-apply-replacements-binary ${CLANG_APPLY_REPLACEMENTS_BIN} # using our clang-apply-replacements binary + -fix # apply suggested changes generated by clang-tidy +) + +string(CONCAT ORC_FORMAT_DIRS + "${PROJECT_SOURCE_DIR}/c++," + "${PROJECT_SOURCE_DIR}/tools," +) + +add_custom_target(format + ${PYTHON_EXECUTABLE} + ${BUILD_SUPPORT_DIR}/run_clang_format.py + ${CLANG_FORMAT_BIN} + --source_dirs + ${ORC_FORMAT_DIRS} + --fix +) + +# Runs clang format and exits with a non-zero exit code if any files need to be reformatted +add_custom_target(check-format + ${PYTHON_EXECUTABLE} + ${BUILD_SUPPORT_DIR}/run_clang_format.py + ${CLANG_FORMAT_BIN} + --source_dirs + ${ORC_FORMAT_DIRS} +) \ No newline at end of file diff --git a/cmake_modules/FindLZ4.cmake b/cmake_modules/FindLZ4.cmake index b1557f496b..3b9cc7fbd1 100644 --- a/cmake_modules/FindLZ4.cmake +++ b/cmake_modules/FindLZ4.cmake @@ -22,6 +22,16 @@ # LZ4_STATIC_LIB: path to lz4.a # LZ4_FOUND: whether LZ4 has been found +if (NOT LZ4_HOME) + if (DEFINED ENV{LZ4_HOME}) + set (LZ4_HOME "$ENV{LZ4_HOME}") + elseif (LZ4_ROOT) + set (LZ4_HOME "${LZ4_ROOT}") + elseif (DEFINED ENV{LZ4_ROOT}) + set (LZ4_HOME "$ENV{LZ4_ROOT}") + endif () +endif () + if( NOT "${LZ4_HOME}" STREQUAL "") file (TO_CMAKE_PATH "${LZ4_HOME}" _lz4_path) endif() @@ -33,7 +43,7 @@ find_path (LZ4_INCLUDE_DIR lz4.h HINTS NO_DEFAULT_PATH PATH_SUFFIXES "include") -find_library (LZ4_LIBRARY NAMES lz4 HINTS +find_library (LZ4_LIBRARY NAMES lz4 liblz4 HINTS ${_lz4_path} PATH_SUFFIXES "lib" "lib64") @@ -74,3 +84,10 @@ mark_as_advanced ( LZ4_STATIC_LIB LZ4_LIBRARY ) + +if(LZ4_FOUND AND NOT TARGET LZ4::lz4) + add_library(LZ4::lz4 UNKNOWN IMPORTED) + set_target_properties(LZ4::lz4 + PROPERTIES IMPORTED_LOCATION "${LZ4_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${LZ4_INCLUDE_DIR}") +endif() diff --git a/cmake_modules/FindProtobuf.cmake b/cmake_modules/FindProtobuf.cmake index cca7c8b87e..ca91fb5ade 100644 --- a/cmake_modules/FindProtobuf.cmake +++ b/cmake_modules/FindProtobuf.cmake @@ -17,7 +17,7 @@ # PROTOBUF_HOME environmental variable is used to check for Protobuf headers and static library -# PROTOBUF_FOUND is set if Protobuf is found +# Protobuf_FOUND is set if Protobuf is found # PROTOBUF_INCLUDE_DIR: directory containing headers # PROTOBUF_LIBRARY: location of libprotobuf # PROTOBUF_STATIC_LIB: location of protobuf.a @@ -25,6 +25,19 @@ # PROTOC_STATIC_LIB: location of protoc.a # PROTOBUF_EXECUTABLE: location of protoc +if (NOT PROTOBUF_HOME) + if (DEFINED ENV{PROTOBUF_HOME}) + set (PROTOBUF_HOME "$ENV{PROTOBUF_HOME}") + elseif (Protobuf_ROOT) + set (PROTOBUF_HOME "${Protobuf_ROOT}") + elseif (DEFINED ENV{Protobuf_ROOT}) + set (PROTOBUF_HOME "$ENV{Protobuf_ROOT}") + elseif (PROTOBUF_ROOT) + set (PROTOBUF_HOME "${PROTOBUF_ROOT}") + elseif (DEFINED ENV{PROTOBUF_ROOT}) + set (PROTOBUF_HOME "$ENV{PROTOBUF_ROOT}") + endif () +endif () if( NOT "${PROTOBUF_HOME}" STREQUAL 
"") file (TO_CMAKE_PATH "${PROTOBUF_HOME}" _protobuf_path) @@ -32,8 +45,17 @@ endif() message (STATUS "PROTOBUF_HOME: ${PROTOBUF_HOME}") +if (NOT DEFINED CMAKE_STATIC_LIBRARY_SUFFIX) + if (WIN32) + set (CMAKE_STATIC_LIBRARY_SUFFIX ".lib") + else () + set (CMAKE_STATIC_LIBRARY_SUFFIX ".a") + endif () +endif () + find_package (Protobuf CONFIG) if (Protobuf_FOUND) + if (TARGET protobuf::libprotobuf) set (PROTOBUF_LIBRARY protobuf::libprotobuf) set (PROTOBUF_STATIC_LIB PROTOBUF_STATIC_LIB-NOTFOUND) set (PROTOC_LIBRARY protobuf::libprotoc) @@ -42,15 +64,34 @@ if (Protobuf_FOUND) get_target_property (target_type protobuf::libprotobuf TYPE) if (target_type STREQUAL "STATIC_LIBRARY") - set(PROTOBUF_STATIC_LIB protobuf::libprotobuf) + set (PROTOBUF_STATIC_LIB protobuf::libprotobuf) endif () get_target_property (target_type protobuf::libprotoc TYPE) if (target_type STREQUAL "STATIC_LIBRARY") - set (PROTOC_STATIC_LIB protobuf::libprotoc) + set (PROTOC_STATIC_LIB protobuf::libprotoc) endif () - get_target_property (PROTOBUF_INCLUDE_DIR protobuf::libprotoc INTERFACE_INCLUDE_DIRECTORIES) + get_target_property (PROTOBUF_INCLUDE_DIR protobuf::libprotobuf INTERFACE_INCLUDE_DIRECTORIES) + if (NOT PROTOBUF_INCLUDE_DIR) + set (PROTOBUF_INCLUDE_DIR ${Protobuf_INCLUDE_DIRS}) + if (NOT PROTOBUF_INCLUDE_DIR) + message (FATAL_ERROR "Cannot determine Protobuf include directory from protobuf::libprotobuf and Protobuf_INCLUDE_DIRS.") + endif () + endif () + else () + set (PROTOBUF_LIBRARY ${Protobuf_LIBRARIES}) + set (PROTOBUF_INCLUDE_DIR ${Protobuf_INCLUDE_DIRS}) + if (NOT PROTOBUF_INCLUDE_DIR) + message (FATAL_ERROR "Cannot determine Protobuf include directory.") + endif () + + if (Protobuf_LIBRARIES MATCHES "\\${CMAKE_STATIC_LIBRARY_SUFFIX}$") + set (PROTOBUF_STATIC_LIB ${Protobuf_LIBRARIES}) + else () + set (PROTOBUF_STATIC_LIB PROTOBUF_STATIC_LIB-NOTFOUND) + endif () + endif () else() find_path (PROTOBUF_INCLUDE_DIR google/protobuf/io/zero_copy_stream.h HINTS @@ -63,7 +104,7 @@ else() NO_DEFAULT_PATH PATH_SUFFIXES "include") - find_library (PROTOBUF_LIBRARY NAMES protobuf HINTS + find_library (PROTOBUF_LIBRARY NAMES protobuf libprotobuf HINTS ${_protobuf_path} PATH_SUFFIXES "lib") @@ -71,7 +112,7 @@ else() ${_protobuf_path} PATH_SUFFIXES "lib") - find_library (PROTOC_LIBRARY NAMES protoc HINTS + find_library (PROTOC_LIBRARY NAMES protoc libprotoc HINTS ${_protobuf_path} PATH_SUFFIXES "lib") @@ -86,14 +127,14 @@ else() endif () if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOC_LIBRARY AND PROTOBUF_EXECUTABLE) - set (PROTOBUF_FOUND TRUE) + set (Protobuf_FOUND TRUE) set (PROTOBUF_LIB_NAME protobuf) set (PROTOC_LIB_NAME protoc) else () - set (PROTOBUF_FOUND FALSE) + set (Protobuf_FOUND FALSE) endif () -if (PROTOBUF_FOUND) +if (Protobuf_FOUND) message (STATUS "Found the Protobuf headers: ${PROTOBUF_INCLUDE_DIR}") message (STATUS "Found the Protobuf library: ${PROTOBUF_LIBRARY}") message (STATUS "Found the Protoc library: ${PROTOC_LIBRARY}") @@ -125,3 +166,10 @@ mark_as_advanced ( PROTOC_STATIC_LIB PROTOC_LIBRARY ) + +if(Protobuf_FOUND AND NOT TARGET protobuf::libprotobuf) + add_library(protobuf::libprotobuf UNKNOWN IMPORTED) + set_target_properties(protobuf::libprotobuf + PROPERTIES IMPORTED_LOCATION "${PROTOBUF_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${PROTOBUF_INCLUDE_DIR}") +endif() diff --git a/cmake_modules/FindSnappy.cmake b/cmake_modules/FindSnappy.cmake index f0a0773801..1ad9914542 100644 --- a/cmake_modules/FindSnappy.cmake +++ b/cmake_modules/FindSnappy.cmake @@ -20,7 +20,21 @@ # 
SNAPPY_INCLUDE_DIR: directory containing headers # SNAPPY_LIBRARY: path to libsnappy # SNAPPY_STATIC_LIB: path to libsnappy.a -# SNAPPY_FOUND: whether snappy has been found +# Snappy_FOUND: whether snappy has been found + +if (NOT SNAPPY_HOME) + if (DEFINED ENV{SNAPPY_HOME}) + set (SNAPPY_HOME "$ENV{SNAPPY_HOME}") + elseif (Snappy_ROOT) + set (SNAPPY_HOME "${Snappy_ROOT}") + elseif (DEFINED ENV{Snappy_ROOT}) + set (SNAPPY_HOME "$ENV{Snappy_ROOT}") + elseif (SNAPPY_ROOT) + set (SNAPPY_HOME "${SNAPPY_ROOT}") + elseif (DEFINED ENV{SNAPPY_ROOT}) + set (SNAPPY_HOME "$ENV{SNAPPY_ROOT}") + endif () +endif () if( NOT "${SNAPPY_HOME}" STREQUAL "") file (TO_CMAKE_PATH "${SNAPPY_HOME}" _snappy_path) @@ -42,14 +56,14 @@ find_library (SNAPPY_STATIC_LIB NAMES ${CMAKE_STATIC_LIBRARY_PREFIX}${SNAPPY_LIB PATH_SUFFIXES "lib" "lib64") if (SNAPPY_INCLUDE_DIR AND SNAPPY_LIBRARY) - set (SNAPPY_FOUND TRUE) + set (Snappy_FOUND TRUE) set (SNAPPY_HEADER_NAME snappy.h) set (SNAPPY_HEADER ${SNAPPY_INCLUDE_DIR}/${SNAPPY_HEADER_NAME}) else () - set (SNAPPY_FOUND FALSE) + set (Snappy_FOUND FALSE) endif () -if (SNAPPY_FOUND) +if (Snappy_FOUND) message (STATUS "Found the Snappy header: ${SNAPPY_HEADER}") message (STATUS "Found the Snappy library: ${SNAPPY_LIBRARY}") if (SNAPPY_STATIC_LIB) @@ -74,3 +88,10 @@ mark_as_advanced ( SNAPPY_STATIC_LIB SNAPPY_LIBRARY ) + +if(Snappy_FOUND AND NOT TARGET Snappy::snappy) + add_library(Snappy::snappy UNKNOWN IMPORTED) + set_target_properties(Snappy::snappy + PROPERTIES IMPORTED_LOCATION "${SNAPPY_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${SNAPPY_INCLUDE_DIR}") +endif() diff --git a/cmake_modules/FindZLIB.cmake b/cmake_modules/FindZLIB.cmake index 2f83a974cd..374814a7f9 100644 --- a/cmake_modules/FindZLIB.cmake +++ b/cmake_modules/FindZLIB.cmake @@ -22,6 +22,16 @@ # ZLIB_STATIC_LIB: path to zlib.a # ZLIB_FOUND: whether ZLIB has been found +if (NOT ZLIB_HOME) + if (DEFINED ENV{ZLIB_HOME}) + set (ZLIB_HOME "$ENV{ZLIB_HOME}") + elseif (ZLIB_ROOT) + set (ZLIB_HOME "${ZLIB_ROOT}") + elseif (DEFINED ENV{ZLIB_ROOT}) + set (ZLIB_HOME "$ENV{ZLIB_ROOT}") + endif () +endif () + if( NOT "${ZLIB_HOME}" STREQUAL "") file (TO_CMAKE_PATH "${ZLIB_HOME}" _zlib_path) endif() @@ -78,3 +88,10 @@ mark_as_advanced ( ZLIB_STATIC_LIB ZLIB_LIBRARY ) + +if(ZLIB_FOUND AND NOT TARGET ZLIB::ZLIB) + add_library(ZLIB::ZLIB UNKNOWN IMPORTED) + set_target_properties(ZLIB::ZLIB + PROPERTIES IMPORTED_LOCATION "${ZLIB_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${ZLIB_INCLUDE_DIR}") +endif() diff --git a/cmake_modules/FindZSTD.cmake b/cmake_modules/FindZSTD.cmake index 7ec197221d..581719453c 100644 --- a/cmake_modules/FindZSTD.cmake +++ b/cmake_modules/FindZSTD.cmake @@ -22,6 +22,16 @@ # ZSTD_STATIC_LIB: path to libzstd.a # ZSTD_FOUND: whether zstd has been found +if (NOT ZSTD_HOME) + if (DEFINED ENV{ZSTD_HOME}) + set (ZSTD_HOME "$ENV{ZSTD_HOME}") + elseif (ZSTD_ROOT) + set (ZSTD_HOME "${ZSTD_ROOT}") + elseif (DEFINED ENV{ZSTD_ROOT}) + set (ZSTD_HOME "$ENV{ZSTD_ROOT}") + endif () +endif () + if( NOT "${ZSTD_HOME}" STREQUAL "") file (TO_CMAKE_PATH "${ZSTD_HOME}" _zstd_path) endif() @@ -74,3 +84,18 @@ mark_as_advanced ( ZSTD_STATIC_LIB ZSTD_LIBRARY ) + +if(ZSTD_FOUND) + if(NOT TARGET zstd::libzstd_static AND ZSTD_STATIC_LIB) + add_library(zstd::libzstd_static STATIC IMPORTED) + set_target_properties(zstd::libzstd_static + PROPERTIES IMPORTED_LOCATION "${ZSTD_STATIC_LIB}" + INTERFACE_INCLUDE_DIRECTORIES "${ZSTD_INCLUDE_DIR}") + endif() + if(NOT TARGET zstd::libzstd_shared AND NOT ZSTD_STATIC_LIB) + 
add_library(zstd::libzstd_shared SHARED IMPORTED) + set_target_properties(zstd::libzstd_shared + PROPERTIES IMPORTED_LOCATION "${ZSTD_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${ZSTD_INCLUDE_DIR}") + endif() +endif() diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake index ec33193d79..c494710ba1 100644 --- a/cmake_modules/ThirdpartyToolchain.cmake +++ b/cmake_modules/ThirdpartyToolchain.cmake @@ -15,13 +15,18 @@ # specific language governing permissions and limitations # under the License. -set(ORC_FORMAT_VERSION "1.0.0") -set(LZ4_VERSION "1.9.3") -set(SNAPPY_VERSION "1.1.7") -set(ZLIB_VERSION "1.2.11") +set(ORC_VENDOR_DEPENDENCIES) +set(ORC_SYSTEM_DEPENDENCIES) +set(ORC_INSTALL_INTERFACE_TARGETS) + +set(ORC_FORMAT_VERSION "1.1.0") +set(LZ4_VERSION "1.10.0") +set(SNAPPY_VERSION "1.2.2") +set(ZLIB_VERSION "1.3.1") set(GTEST_VERSION "1.12.1") set(PROTOBUF_VERSION "3.5.1") -set(ZSTD_VERSION "1.5.5") +set(ZSTD_VERSION "1.5.7") +set(SPARSEHASH_VERSION "2.11.1") option(ORC_PREFER_STATIC_PROTOBUF "Prefer static protobuf library, if available" ON) option(ORC_PREFER_STATIC_SNAPPY "Prefer static snappy library, if available" ON) @@ -33,7 +38,7 @@ option(ORC_PREFER_STATIC_GMOCK "Prefer static gmock library, if available" # zstd requires us to add the threads FIND_PACKAGE(Threads REQUIRED) -set(THIRDPARTY_DIR "${CMAKE_BINARY_DIR}/c++/libs/thirdparty") +set(THIRDPARTY_DIR "${PROJECT_BINARY_DIR}/c++/libs/thirdparty") set(THIRDPARTY_LOG_OPTIONS LOG_CONFIGURE 1 LOG_BUILD 1 LOG_INSTALL 1 @@ -47,34 +52,101 @@ string(TOUPPER ${CMAKE_BUILD_TYPE} UPPERCASE_BUILD_TYPE) if (DEFINED ENV{SNAPPY_HOME}) set (SNAPPY_HOME "$ENV{SNAPPY_HOME}") +elseif (Snappy_ROOT) + set (SNAPPY_HOME "${Snappy_ROOT}") +elseif (DEFINED ENV{Snappy_ROOT}) + set (SNAPPY_HOME "$ENV{Snappy_ROOT}") +elseif (SNAPPY_ROOT) + set (SNAPPY_HOME "${SNAPPY_ROOT}") +elseif (DEFINED ENV{SNAPPY_ROOT}) + set (SNAPPY_HOME "$ENV{SNAPPY_ROOT}") endif () if (DEFINED ENV{ZLIB_HOME}) set (ZLIB_HOME "$ENV{ZLIB_HOME}") +elseif (ZLIB_ROOT) + set (ZLIB_HOME "${ZLIB_ROOT}") +elseif (DEFINED ENV{ZLIB_ROOT}) + set (ZLIB_HOME "$ENV{ZLIB_ROOT}") endif () if (DEFINED ENV{LZ4_HOME}) set (LZ4_HOME "$ENV{LZ4_HOME}") +elseif (LZ4_ROOT) + set (LZ4_HOME "${LZ4_ROOT}") +elseif (DEFINED ENV{LZ4_ROOT}) + set (LZ4_HOME "$ENV{LZ4_ROOT}") endif () if (DEFINED ENV{PROTOBUF_HOME}) set (PROTOBUF_HOME "$ENV{PROTOBUF_HOME}") +elseif (Protobuf_ROOT) + set (PROTOBUF_HOME "${Protobuf_ROOT}") +elseif (DEFINED ENV{Protobuf_ROOT}) + set (PROTOBUF_HOME "$ENV{Protobuf_ROOT}") +elseif (PROTOBUF_ROOT) + set (PROTOBUF_HOME "${PROTOBUF_ROOT}") +elseif (DEFINED ENV{PROTOBUF_ROOT}) + set (PROTOBUF_HOME "$ENV{PROTOBUF_ROOT}") endif () if (DEFINED ENV{ZSTD_HOME}) set (ZSTD_HOME "$ENV{ZSTD_HOME}") +elseif (ZSTD_ROOT) + set (ZSTD_HOME "${ZSTD_ROOT}") +elseif (DEFINED ENV{ZSTD_ROOT}) + set (ZSTD_HOME "$ENV{ZSTD_ROOT}") endif () if (DEFINED ENV{GTEST_HOME}) set (GTEST_HOME "$ENV{GTEST_HOME}") endif () +# ---------------------------------------------------------------------- +# Macros for adding third-party libraries +macro (orc_add_resolved_library target_name link_lib include_dir) + add_library (${target_name} INTERFACE IMPORTED GLOBAL) + target_link_libraries (${target_name} INTERFACE ${link_lib}) + target_include_directories (${target_name} SYSTEM INTERFACE ${include_dir}) +endmacro () + +macro (orc_add_built_library external_project_name target_name link_lib include_dir) + file (MAKE_DIRECTORY "${include_dir}") + + add_library (${target_name} STATIC 
IMPORTED) + set_target_properties (${target_name} PROPERTIES IMPORTED_LOCATION "${link_lib}") + target_include_directories (${target_name} BEFORE INTERFACE "${include_dir}") + + add_dependencies (${target_name} ${external_project_name}) + if (INSTALL_VENDORED_LIBS) + install (FILES "${link_lib}" DESTINATION "${CMAKE_INSTALL_LIBDIR}") + endif () +endmacro () + +function(orc_provide_cmake_module MODULE_NAME) + set(module "${PROJECT_SOURCE_DIR}/cmake_modules/${MODULE_NAME}.cmake") + if(EXISTS "${module}") + message(STATUS "Providing CMake module for ${MODULE_NAME} as part of CMake package") + install(FILES "${module}" DESTINATION "${ORC_INSTALL_CMAKE_DIR}") + endif() +endfunction() + +function(orc_provide_find_module PACKAGE_NAME) + orc_provide_cmake_module("Find${PACKAGE_NAME}") +endfunction() + # ---------------------------------------------------------------------- # ORC Format +if(DEFINED ENV{ORC_FORMAT_URL}) + set(ORC_FORMAT_SOURCE_URL "$ENV{ORC_FORMAT_URL}") + message(STATUS "Using ORC_FORMAT_URL: ${ORC_FORMAT_SOURCE_URL}") +else() + set(ORC_FORMAT_SOURCE_URL "https://www.apache.org/dyn/closer.lua/orc/orc-format-$%7BORC_FORMAT_VERSION%7D/orc-format-$%7BORC_FORMAT_VERSION%7D.tar.gz?action=download" ) + message(STATUS "Using DEFAULT URL: ${ORC_FORMAT_SOURCE_URL}") +endif() ExternalProject_Add (orc-format_ep - URL "https://dlcdn.apache.org/orc/orc-format-$%7BORC_FORMAT_VERSION%7D/orc-format-$%7BORC_FORMAT_VERSION%7D.tar.gz" - URL "https://archive.apache.org/dist/orc/orc-format-$%7BORC_FORMAT_VERSION%7D/orc-format-$%7BORC_FORMAT_VERSION%7D.tar.gz" - URL_HASH SHA256=739fae5ff94b1f812b413077280361045bf92e510ef04b34a610e23a945d8cd5 + URL ${ORC_FORMAT_SOURCE_URL} + URL_HASH SHA256=d4a7ac76c5442abf7119e2cb84e71b677e075aff53518aa866055e2ead0450d7 CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" @@ -83,16 +155,36 @@ ExternalProject_Add (orc-format_ep # ---------------------------------------------------------------------- # Snappy - -if (NOT "${SNAPPY_HOME}" STREQUAL "" OR ORC_PACKAGE_KIND STREQUAL "conan") +if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (Snappy REQUIRED CONFIG) + add_library (orc_snappy INTERFACE) + target_link_libraries(orc_snappy INTERFACE Snappy::snappy) + list (APPEND ORC_SYSTEM_DEPENDENCIES Snappy) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + find_package(Snappy CONFIG REQUIRED) + add_library (orc_snappy INTERFACE IMPORTED) + target_link_libraries(orc_snappy INTERFACE Snappy::snappy) + list (APPEND ORC_SYSTEM_DEPENDENCIES Snappy) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +elseif (NOT "${SNAPPY_HOME}" STREQUAL "") find_package (Snappy REQUIRED) - set(SNAPPY_VENDORED FALSE) + if (ORC_PREFER_STATIC_SNAPPY AND SNAPPY_STATIC_LIB) + orc_add_resolved_library (orc_snappy ${SNAPPY_STATIC_LIB} ${SNAPPY_INCLUDE_DIR}) + else () + orc_add_resolved_library (orc_snappy ${SNAPPY_LIBRARY} ${SNAPPY_INCLUDE_DIR}) + endif () + list (APPEND ORC_SYSTEM_DEPENDENCIES Snappy) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + orc_provide_find_module (Snappy) else () set(SNAPPY_HOME "${THIRDPARTY_DIR}/snappy_ep-install") set(SNAPPY_INCLUDE_DIR "${SNAPPY_HOME}/include") - set(SNAPPY_STATIC_LIB "${SNAPPY_HOME}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}snappy${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(SNAPPY_STATIC_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}snappy${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(SNAPPY_STATIC_LIB "${SNAPPY_HOME}/lib/${SNAPPY_STATIC_LIB_NAME}") set(SNAPPY_CMAKE_ARGS 
-DCMAKE_INSTALL_PREFIX=${SNAPPY_HOME} - -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_LIBDIR=lib) + -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_LIBDIR=lib + -DSNAPPY_BUILD_BENCHMARKS=OFF) if (BUILD_POSITION_INDEPENDENT_LIB) set(SNAPPY_CMAKE_ARGS ${SNAPPY_CMAKE_ARGS} -DCMAKE_POSITION_INDEPENDENT_CODE=ON) @@ -104,39 +196,39 @@ else () ${THIRDPARTY_LOG_OPTIONS} BUILD_BYPRODUCTS "${SNAPPY_STATIC_LIB}") - set(SNAPPY_LIBRARY ${SNAPPY_STATIC_LIB}) - set(SNAPPY_VENDORED TRUE) -endif () + orc_add_built_library (snappy_ep orc_snappy ${SNAPPY_STATIC_LIB} ${SNAPPY_INCLUDE_DIR}) -add_library (orc_snappy INTERFACE) -add_library (orc::snappy ALIAS orc_snappy) -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_link_libraries(orc_snappy INTERFACE ${Snappy_LIBRARIES}) -elseif (ORC_PREFER_STATIC_SNAPPY AND ${SNAPPY_STATIC_LIB}) - target_link_libraries(orc_snappy INTERFACE ${SNAPPY_STATIC_LIB}) -else () - target_link_libraries(orc_snappy INTERFACE ${SNAPPY_LIBRARY}) -endif () -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_include_directories (orc_snappy SYSTEM INTERFACE ${Snappy_INCLUDE_DIR}) -else() - target_include_directories (orc_snappy SYSTEM INTERFACE ${SNAPPY_INCLUDE_DIR}) + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_snappy|${SNAPPY_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") endif () -if (SNAPPY_VENDORED) - add_dependencies (orc_snappy snappy_ep) - if (INSTALL_VENDORED_LIBS) - install(FILES "${SNAPPY_STATIC_LIB}" - DESTINATION "lib") - endif () -endif () +add_library (orc::snappy ALIAS orc_snappy) # ---------------------------------------------------------------------- # ZLIB -if (NOT "${ZLIB_HOME}" STREQUAL "" OR ORC_PACKAGE_KIND STREQUAL "conan") +if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (ZLIB REQUIRED CONFIG) + add_library (orc_zlib INTERFACE) + target_link_libraries(orc_zlib INTERFACE ZLIB::ZLIB) + list (APPEND ORC_SYSTEM_DEPENDENCIES ZLIB) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + find_package(ZLIB REQUIRED) + add_library (orc_zlib INTERFACE IMPORTED) + target_link_libraries(orc_zlib INTERFACE ZLIB::ZLIB) + list (APPEND ORC_SYSTEM_DEPENDENCIES ZLIB) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +elseif (NOT "${ZLIB_HOME}" STREQUAL "") find_package (ZLIB REQUIRED) - set(ZLIB_VENDORED FALSE) + if (ORC_PREFER_STATIC_ZLIB AND ZLIB_STATIC_LIB) + orc_add_resolved_library (orc_zlib ${ZLIB_STATIC_LIB} ${ZLIB_INCLUDE_DIR}) + else () + orc_add_resolved_library (orc_zlib ${ZLIB_LIBRARY} ${ZLIB_INCLUDE_DIR}) + endif () + list (APPEND ORC_SYSTEM_DEPENDENCIES ZLIB) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + orc_provide_find_module (ZLIB) else () set(ZLIB_PREFIX "${THIRDPARTY_DIR}/zlib_ep-install") set(ZLIB_INCLUDE_DIR "${ZLIB_PREFIX}/include") @@ -148,7 +240,8 @@ else () else () set(ZLIB_STATIC_LIB_NAME z) endif () - set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}${ZLIB_STATIC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(ZLIB_STATIC_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${ZLIB_STATIC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${ZLIB_STATIC_LIB_NAME}") set(ZLIB_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX} -DBUILD_SHARED_LIBS=OFF) @@ -162,35 +255,43 @@ else () ${THIRDPARTY_LOG_OPTIONS} BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}") - set(ZLIB_LIBRARY ${ZLIB_STATIC_LIB}) - set(ZLIB_VENDORED TRUE) -endif () + orc_add_built_library (zlib_ep orc_zlib ${ZLIB_STATIC_LIB} ${ZLIB_INCLUDE_DIR}) -add_library (orc_zlib INTERFACE) -add_library (orc::zlib 
ALIAS orc_zlib) -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_link_libraries (orc_zlib INTERFACE ${ZLIB_LIBRARIES}) -elseif (ORC_PREFER_STATIC_ZLIB AND ${ZLIB_STATIC_LIB}) - target_link_libraries (orc_zlib INTERFACE ${ZLIB_STATIC_LIB}) -else () - target_link_libraries (orc_zlib INTERFACE ${ZLIB_LIBRARY}) + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_zlib|${ZLIB_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") endif () -target_include_directories (orc_zlib SYSTEM INTERFACE ${ZLIB_INCLUDE_DIR}) -if (ZLIB_VENDORED) - add_dependencies (orc_zlib zlib_ep) - if (INSTALL_VENDORED_LIBS) - install(FILES "${ZLIB_STATIC_LIB}" - DESTINATION "lib") - endif () -endif () +add_library (orc::zlib ALIAS orc_zlib) # ---------------------------------------------------------------------- # Zstd -if (NOT "${ZSTD_HOME}" STREQUAL "" OR ORC_PACKAGE_KIND STREQUAL "conan") +if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (ZSTD REQUIRED CONFIG) + add_library (orc_zstd INTERFACE) + target_link_libraries (orc_zstd INTERFACE + $ + $ + ) + list (APPEND ORC_SYSTEM_DEPENDENCIES ZSTD) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$,zstd::libzstd_shared,zstd::libzstd_static>>") +elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + find_package(zstd CONFIG REQUIRED) + add_library(orc_zstd INTERFACE) + target_link_libraries(orc_zstd INTERFACE $,zstd::libzstd_shared,zstd::libzstd_static>) + list(APPEND ORC_SYSTEM_DEPENDENCIES zstd) + list(APPEND ORC_INSTALL_INTERFACE_TARGETS "$,zstd::libzstd_shared,zstd::libzstd_static>>") +elseif (NOT "${ZSTD_HOME}" STREQUAL "") find_package (ZSTD REQUIRED) - set(ZSTD_VENDORED FALSE) + if (ORC_PREFER_STATIC_ZSTD AND ZSTD_STATIC_LIB) + orc_add_resolved_library (orc_zstd ${ZSTD_STATIC_LIB} ${ZSTD_INCLUDE_DIR}) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + else () + orc_add_resolved_library (orc_zstd ${ZSTD_LIBRARY} ${ZSTD_INCLUDE_DIR}) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$,zstd::libzstd_shared,zstd::libzstd_static>>") + endif () + list (APPEND ORC_SYSTEM_DEPENDENCIES ZSTD) + orc_provide_find_module (ZSTD) else () set(ZSTD_HOME "${THIRDPARTY_DIR}/zstd_ep-install") set(ZSTD_INCLUDE_DIR "${ZSTD_HOME}/include") @@ -202,7 +303,8 @@ else () else () set(ZSTD_STATIC_LIB_NAME zstd) endif () - set(ZSTD_STATIC_LIB "${ZSTD_HOME}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}${ZSTD_STATIC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(ZSTD_STATIC_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${ZSTD_STATIC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(ZSTD_STATIC_LIB "${ZSTD_HOME}/lib/${ZSTD_STATIC_LIB_NAME}") set(ZSTD_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZSTD_HOME} -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_LIBDIR=lib) @@ -223,43 +325,46 @@ else () ${THIRDPARTY_LOG_OPTIONS} BUILD_BYPRODUCTS ${ZSTD_STATIC_LIB}) - set(ZSTD_LIBRARY ${ZSTD_STATIC_LIB}) - set(ZSTD_VENDORED TRUE) -endif () + orc_add_built_library (zstd_ep orc_zstd ${ZSTD_STATIC_LIB} ${ZSTD_INCLUDE_DIR}) -add_library (orc_zstd INTERFACE) -add_library (orc::zstd ALIAS orc_zstd) -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_link_libraries (orc_zstd INTERFACE ${zstd_LIBRARIES}) -elseif (ORC_PREFER_STATIC_ZSTD AND ${ZSTD_STATIC_LIB}) - target_link_libraries (orc_zstd INTERFACE ${ZSTD_STATIC_LIB}) -else () - target_link_libraries (orc_zstd INTERFACE ${ZSTD_LIBRARY}) -endif () -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_include_directories (orc_zstd SYSTEM INTERFACE ${zstd_INCLUDE_DIR}) -else() - target_include_directories (orc_zstd SYSTEM INTERFACE ${ZSTD_INCLUDE_DIR}) + list (APPEND ORC_VENDOR_DEPENDENCIES 
"orc::vendored_zstd|${ZSTD_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") endif () -if (ZSTD_VENDORED) - add_dependencies (orc_zstd zstd_ep) - if (INSTALL_VENDORED_LIBS) - install(FILES "${ZSTD_STATIC_LIB}" - DESTINATION "lib") - endif () -endif () +add_library (orc::zstd ALIAS orc_zstd) # ---------------------------------------------------------------------- # LZ4 - -if (NOT "${LZ4_HOME}" STREQUAL "" OR ORC_PACKAGE_KIND STREQUAL "conan") +if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (LZ4 REQUIRED CONFIG) + add_library (orc_lz4 INTERFACE) + target_link_libraries (orc_lz4 INTERFACE + $ + $ + ) + list (APPEND ORC_SYSTEM_DEPENDENCIES LZ4) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$,LZ4::lz4_shared,LZ4::lz4_static>>") +elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + find_package(lz4 CONFIG REQUIRED) + add_library (orc_lz4 INTERFACE IMPORTED) + target_link_libraries(orc_lz4 INTERFACE LZ4::lz4) + list (APPEND ORC_SYSTEM_DEPENDENCIES lz4) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +elseif (NOT "${LZ4_HOME}" STREQUAL "") find_package (LZ4 REQUIRED) - set(LZ4_VENDORED FALSE) + if (ORC_PREFER_STATIC_LZ4 AND LZ4_STATIC_LIB) + orc_add_resolved_library (orc_lz4 ${LZ4_STATIC_LIB} ${LZ4_INCLUDE_DIR}) + else () + orc_add_resolved_library (orc_lz4 ${LZ4_LIBRARY} ${LZ4_INCLUDE_DIR}) + endif () + list (APPEND ORC_SYSTEM_DEPENDENCIES LZ4) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + orc_provide_find_module (LZ4) else () set(LZ4_PREFIX "${THIRDPARTY_DIR}/lz4_ep-install") set(LZ4_INCLUDE_DIR "${LZ4_PREFIX}/include") - set(LZ4_STATIC_LIB "${LZ4_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}lz4${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(LZ4_STATIC_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}lz4${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(LZ4_STATIC_LIB "${LZ4_PREFIX}/lib/${LZ4_STATIC_LIB_NAME}") set(LZ4_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LZ4_PREFIX} -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=OFF) @@ -281,32 +386,13 @@ else () ${THIRDPARTY_LOG_OPTIONS} BUILD_BYPRODUCTS ${LZ4_STATIC_LIB}) - set(LZ4_LIBRARY ${LZ4_STATIC_LIB}) - set(LZ4_VENDORED TRUE) -endif () + orc_add_built_library (lz4_ep orc_lz4 ${LZ4_STATIC_LIB} ${LZ4_INCLUDE_DIR}) -add_library (orc_lz4 INTERFACE) -add_library (orc::lz4 ALIAS orc_lz4) -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_link_libraries (orc_lz4 INTERFACE ${lz4_LIBRARIES}) -elseif (ORC_PREFER_STATIC_LZ4 AND ${LZ4_STATIC_LIB}) - target_link_libraries (orc_lz4 INTERFACE ${LZ4_STATIC_LIB}) -else () - target_link_libraries (orc_lz4 INTERFACE ${LZ4_LIBRARY}) -endif () -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_include_directories (orc_lz4 SYSTEM INTERFACE ${lz4_INCLUDE_DIR}) -else() - target_include_directories (orc_lz4 SYSTEM INTERFACE ${LZ4_INCLUDE_DIR}) + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_lz4|${LZ4_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") endif () -if (LZ4_VENDORED) - add_dependencies (orc_lz4 lz4_ep) - if (INSTALL_VENDORED_LIBS) - install(FILES "${LZ4_STATIC_LIB}" - DESTINATION "lib") - endif () -endif () +add_library (orc::lz4 ALIAS orc_lz4) # ---------------------------------------------------------------------- # IANA - Time Zone Database @@ -390,7 +476,7 @@ if (BUILD_CPP_TESTS) add_library (orc::gmock ALIAS orc_gmock) add_library (orc_gtest INTERFACE) add_library (orc::gtest ALIAS orc_gtest) - if (ORC_PREFER_STATIC_GMOCK AND ${GMOCK_STATIC_LIB}) + if (ORC_PREFER_STATIC_GMOCK AND GMOCK_STATIC_LIB) target_link_libraries (orc_gmock INTERFACE ${GMOCK_STATIC_LIB}) target_link_libraries (orc_gtest 
INTERFACE ${GTEST_STATIC_LIB}) else () @@ -414,14 +500,43 @@ endif () # ---------------------------------------------------------------------- # Protobuf -if (NOT "${PROTOBUF_HOME}" STREQUAL "" OR ORC_PACKAGE_KIND STREQUAL "conan") +if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (Protobuf REQUIRED CONFIG) + add_library (orc_protobuf INTERFACE) + target_link_libraries(orc_protobuf INTERFACE protobuf::protobuf) + list (APPEND ORC_SYSTEM_DEPENDENCIES Protobuf) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + find_package(Protobuf CONFIG REQUIRED) + add_library (orc_protobuf INTERFACE IMPORTED) + target_link_libraries(orc_protobuf INTERFACE protobuf::libprotobuf) + list (APPEND ORC_SYSTEM_DEPENDENCIES Protobuf) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + set (PROTOBUF_EXECUTABLE protobuf::protoc) +elseif (NOT "${PROTOBUF_HOME}" STREQUAL "") find_package (Protobuf REQUIRED) - set(PROTOBUF_VENDORED FALSE) + + if (ORC_PREFER_STATIC_PROTOBUF AND PROTOBUF_STATIC_LIB) + orc_add_resolved_library (orc_protobuf ${PROTOBUF_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) + else () + orc_add_resolved_library (orc_protobuf ${PROTOBUF_LIBRARY} ${PROTOBUF_INCLUDE_DIR}) + endif () + + if (ORC_PREFER_STATIC_PROTOBUF AND PROTOC_STATIC_LIB) + orc_add_resolved_library (orc_protoc ${PROTOC_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) + else () + orc_add_resolved_library (orc_protoc ${PROTOC_LIBRARY} ${PROTOBUF_INCLUDE_DIR}) + endif () + + list (APPEND ORC_SYSTEM_DEPENDENCIES Protobuf) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + orc_provide_find_module (Protobuf) else () set(PROTOBUF_PREFIX "${THIRDPARTY_DIR}/protobuf_ep-install") set(PROTOBUF_INCLUDE_DIR "${PROTOBUF_PREFIX}/include") set(PROTOBUF_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PROTOBUF_PREFIX} -DCMAKE_INSTALL_LIBDIR=lib + -DCMAKE_POLICY_VERSION_MINIMUM=3.12 -DBUILD_SHARED_LIBS=OFF -Dprotobuf_BUILD_TESTS=OFF) @@ -436,7 +551,8 @@ else () else () set(PROTOBUF_STATIC_LIB_PREFIX ${CMAKE_STATIC_LIBRARY_PREFIX}) endif () - set(PROTOBUF_STATIC_LIB "${PROTOBUF_PREFIX}/lib/${PROTOBUF_STATIC_LIB_PREFIX}protobuf${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(PROTOBUF_STATIC_LIB_NAME "${PROTOBUF_STATIC_LIB_PREFIX}protobuf${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(PROTOBUF_STATIC_LIB "${PROTOBUF_PREFIX}/lib/${PROTOBUF_STATIC_LIB_NAME}") set(PROTOC_STATIC_LIB "${PROTOBUF_PREFIX}/lib/${PROTOBUF_STATIC_LIB_PREFIX}protoc${CMAKE_STATIC_LIBRARY_SUFFIX}") set(PROTOBUF_EXECUTABLE "${PROTOBUF_PREFIX}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}") @@ -453,46 +569,56 @@ else () ${THIRDPARTY_LOG_OPTIONS} BUILD_BYPRODUCTS "${PROTOBUF_STATIC_LIB}" "${PROTOC_STATIC_LIB}") - set(PROTOBUF_LIBRARY ${PROTOBUF_STATIC_LIB}) - set(PROTOC_LIBRARY ${PROTOC_STATIC_LIB}) - set(PROTOBUF_VENDORED TRUE) + orc_add_built_library (protobuf_ep orc_protobuf ${PROTOBUF_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) + orc_add_built_library (protobuf_ep orc_protoc ${PROTOC_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) + + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_protobuf|${PROTOBUF_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") endif () -add_library (orc_protobuf INTERFACE) add_library (orc::protobuf ALIAS orc_protobuf) -add_library (orc_protoc INTERFACE) -add_library (orc::protoc ALIAS orc_protoc) +if (NOT (ORC_PACKAGE_KIND STREQUAL "conan" OR ORC_PACKAGE_KIND STREQUAL "vcpkg")) + add_library (orc::protoc ALIAS orc_protoc) +endif () -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_link_libraries (orc_protobuf INTERFACE ${protobuf_LIBRARIES}) -elseif 
(ORC_PREFER_STATIC_PROTOBUF AND ${PROTOBUF_STATIC_LIB}) - target_link_libraries (orc_protobuf INTERFACE ${PROTOBUF_STATIC_LIB}) -else () - target_link_libraries (orc_protobuf INTERFACE ${PROTOBUF_LIBRARY}) -endif() -if (ORC_PACKAGE_KIND STREQUAL "conan") - target_include_directories (orc_protobuf SYSTEM INTERFACE ${protobuf_INCLUDE_DIR}) -else () - target_include_directories (orc_protobuf SYSTEM INTERFACE ${PROTOBUF_INCLUDE_DIR}) +# ---------------------------------------------------------------------- +# SPARSEHASH + +set(SPARSEHASH_HOME "${THIRDPARTY_DIR}/sparsehash_ep-install") +set(SPARSEHASH_INCLUDE_DIR "${SPARSEHASH_HOME}/include/google") +set(SPARSEHASH_CMAKE_ARGS + -DCMAKE_INSTALL_PREFIX=${SPARSEHASH_HOME} + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_INSTALL_LIBDIR=lib + -DCMAKE_POLICY_VERSION_MINIMUM=3.5 +) +if (BUILD_POSITION_INDEPENDENT_LIB) + set(SPARSEHASH_CMAKE_ARGS ${SPARSEHASH_CMAKE_ARGS} -DCMAKE_POSITION_INDEPENDENT_CODE=ON) endif () -if (NOT ORC_PACKAGE_KIND STREQUAL "conan") - if (ORC_PREFER_STATIC_PROTOBUF AND ${PROTOC_STATIC_LIB}) - target_link_libraries (orc_protoc INTERFACE ${PROTOC_STATIC_LIB}) - else () - target_link_libraries (orc_protoc INTERFACE ${PROTOC_LIBRARY}) - endif() - target_include_directories (orc_protoc SYSTEM INTERFACE ${PROTOBUF_INCLUDE_DIR}) +if (CMAKE_VERSION VERSION_GREATER "3.7") + set(SPARSEHASH_CONFIGURE SOURCE_SUBDIR "" CMAKE_ARGS ${SPARSEHASH_CMAKE_ARGS}) + else() + set(SPARSEHASH_CONFIGURE CONFIGURE_COMMAND "${THIRDPARTY_CONFIGURE_COMMAND}" ${SPARSEHASH_CMAKE_ARGS} + "${CMAKE_CURRENT_BINARY_DIR}/sparsehash_ep-prefix/src/sparsehash_ep/") endif() -if (PROTOBUF_VENDORED) - add_dependencies (orc_protoc protobuf_ep) - add_dependencies (orc_protobuf protobuf_ep) - if (INSTALL_VENDORED_LIBS) - install(FILES "${PROTOBUF_STATIC_LIB}" "${PROTOC_STATIC_LIB}" - DESTINATION "lib") - endif () -endif () +ExternalProject_Add(sparsehash_ep + URL "https://github.com/sparsehash/sparsehash-c11/archive/refs/tags/v$%7BSPARSEHASH_VERSION%7D.tar.gz" + ${SPARSEHASH_CONFIGURE} + ${THIRDPARTY_LOG_OPTIONS}) + +# sparsehash-c11 is header-only, create interface library +add_library(orc_sparsehash INTERFACE) +target_include_directories(orc_sparsehash INTERFACE + $ + $) +add_dependencies(orc_sparsehash sparsehash_ep) + +list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_sparsehash") +list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + +add_library (orc::sparsehash ALIAS orc_sparsehash) # ---------------------------------------------------------------------- # LIBHDFSPP @@ -509,7 +635,7 @@ if(BUILD_LIBHDFSPP) set (LIBHDFSPP_INCLUDE_DIR "${LIBHDFSPP_PREFIX}/include") set (LIBHDFSPP_STATIC_LIB_NAME hdfspp_static) set (LIBHDFSPP_STATIC_LIB "${LIBHDFSPP_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}${LIBHDFSPP_STATIC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") - set (LIBHDFSPP_SRC_URL "${CMAKE_SOURCE_DIR}/c++/libs/libhdfspp/libhdfspp.tar.gz") + set (LIBHDFSPP_SRC_URL "${PROJECT_SOURCE_DIR}/c++/libs/libhdfspp/libhdfspp.tar.gz") set (LIBHDFSPP_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${LIBHDFSPP_PREFIX} -DPROTOBUF_INCLUDE_DIR=${PROTOBUF_INCLUDE_DIR} @@ -536,15 +662,7 @@ if(BUILD_LIBHDFSPP) BUILD_BYPRODUCTS "${LIBHDFSPP_STATIC_LIB}" CMAKE_ARGS ${LIBHDFSPP_CMAKE_ARGS}) - include_directories (SYSTEM ${LIBHDFSPP_INCLUDE_DIR}) - - add_library (libhdfspp STATIC IMPORTED) - set_target_properties (libhdfspp PROPERTIES IMPORTED_LOCATION ${LIBHDFSPP_STATIC_LIB}) - add_dependencies (libhdfspp libhdfspp_ep) - if (INSTALL_VENDORED_LIBS) - install(FILES 
"${LIBHDFSPP_STATIC_LIB}" - DESTINATION "lib") - endif () + orc_add_built_library(libhdfspp_ep libhdfspp ${LIBHDFSPP_STATIC_LIB} ${LIBHDFSPP_INCLUDE_DIR}) set (LIBHDFSPP_LIBRARIES libhdfspp diff --git a/docker/README.md b/docker/README.md index e9a3b65b12..2247cea966 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,9 +1,12 @@ +# Docker Test + ## Supported OSes * Debian 11 and 12 * Fedora 37 -* Ubuntu 20, 22, 24 +* Ubuntu 22 and 24 * Oracle Linux 9 +* Amazon Linux 2023 ## Pre-built Images @@ -25,6 +28,7 @@ To test against all of the Linux OSes against Apache's main branch: Using `local` as the owner will cause the scripts to use the local repository. The scripts are: + * `run-all.sh` *owner* *branch* - test the given owner's branch on all OSes * `run-one.sh` *owner* *branch* *os* - test the owner's branch on one OS * `reinit.sh` - rebuild all of the base images without the image cache diff --git a/docker/ubuntu20/Dockerfile b/docker/amazonlinux23/Dockerfile similarity index 60% rename from docker/ubuntu20/Dockerfile rename to docker/amazonlinux23/Dockerfile index 59a487bb8d..806a58f898 100644 --- a/docker/ubuntu20/Dockerfile +++ b/docker/amazonlinux23/Dockerfile @@ -14,43 +14,35 @@ # See the License for the specific language governing permissions and # limitations under the License. -# ORC compile for Ubuntu 20 +# ORC compile for Amazon Linux 2023 # -FROM ubuntu:20.04 -LABEL maintainer="Apache ORC project " -ARG jdk=17 -ARG cc=gcc +FROM amazonlinux:2023 +LABEL org.opencontainers.image.authors="Apache ORC project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache ORC on Amazon Linux 2023" +LABEL org.opencontainers.image.version="" -RUN ln -fs /usr/share/zoneinfo/America/Los_Angeles /etc/localtime -RUN apt-get update -RUN apt-get install -y \ - cmake \ +RUN yum check-update || true +RUN yum install -y \ + cmake3 \ + curl-devel \ + cyrus-sasl-devel \ + expat-devel \ + gcc \ + gcc-c++ \ + gettext-devel \ git \ - libsasl2-dev \ - libssl-dev \ + libtool \ make \ - curl \ - maven \ - openjdk-${jdk}-jdk \ - tzdata; \ - if [ "${cc}" = "gcc" ] ; then \ - apt-get install -y \ - gcc \ - g++ \ - ; else \ - apt-get install -y \ - clang \ - && \ - update-alternatives --set cc /usr/bin/clang && \ - update-alternatives --set c++ /usr/bin/clang++ \ - ; fi -RUN update-alternatives --set java $(update-alternatives --list java | grep ${jdk}) && \ - update-alternatives --set javac $(update-alternatives --list javac | grep ${jdk}) - -ENV CC=cc -ENV CXX=c++ + openssl-devel \ + tar \ + wget \ + which \ + zlib-devel \ + java-17-amazon-corretto-devel +ENV TZ=America/Los_Angeles WORKDIR /root VOLUME /root/.m2/repository diff --git a/docker/debian11/Dockerfile b/docker/debian11/Dockerfile index fb804a316b..7af433de18 100644 --- a/docker/debian11/Dockerfile +++ b/docker/debian11/Dockerfile @@ -18,7 +18,10 @@ # FROM debian:bullseye -LABEL maintainer="Apache ORC project " +LABEL org.opencontainers.image.authors="Apache ORC project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache ORC on Debian 11" +LABEL org.opencontainers.image.version="" ARG jdk=17 RUN apt-get update diff --git a/docker/debian12/Dockerfile b/docker/debian12/Dockerfile index f0c2a600eb..ae341183f9 100644 --- a/docker/debian12/Dockerfile +++ b/docker/debian12/Dockerfile @@ -18,7 +18,10 @@ # FROM debian:bookworm -LABEL maintainer="Apache ORC project " +LABEL org.opencontainers.image.authors="Apache ORC project " +LABEL 
org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache ORC on Debian 12" +LABEL org.opencontainers.image.version="" ARG jdk=17 RUN apt-get update diff --git a/docker/fedora37/Dockerfile b/docker/oraclelinux8/Dockerfile similarity index 88% rename from docker/fedora37/Dockerfile rename to docker/oraclelinux8/Dockerfile index bf4a50fc1c..4951f26bec 100644 --- a/docker/fedora37/Dockerfile +++ b/docker/oraclelinux8/Dockerfile @@ -14,11 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -# ORC compile for CentOS 7 +# ORC compile for Oracle Linux 8 # -FROM fedora:37 -LABEL maintainer="Apache ORC project " +FROM oraclelinux:8 +LABEL org.opencontainers.image.authors="Apache ORC project " +LABEL org.opencontainers.image.licenses="Apache-2.0" RUN yum check-update || true RUN yum install -y \ diff --git a/docker/oraclelinux9/Dockerfile b/docker/oraclelinux9/Dockerfile index 094ec828f2..a0f9623490 100644 --- a/docker/oraclelinux9/Dockerfile +++ b/docker/oraclelinux9/Dockerfile @@ -18,7 +18,8 @@ # FROM oraclelinux:9 -LABEL maintainer="Apache ORC project " +LABEL org.opencontainers.image.authors="Apache ORC project " +LABEL org.opencontainers.image.licenses="Apache-2.0" RUN yum check-update || true RUN yum install -y \ diff --git a/docker/os-list.txt b/docker/os-list.txt index 3966df3245..e138aaf493 100644 --- a/docker/os-list.txt +++ b/docker/os-list.txt @@ -1,7 +1,7 @@ debian11 debian12 -ubuntu20 ubuntu22 ubuntu24 -fedora37 +oraclelinux8 oraclelinux9 +amazonlinux23 diff --git a/docker/ubuntu22/Dockerfile b/docker/ubuntu22/Dockerfile index 81f6269518..03863f20a4 100644 --- a/docker/ubuntu22/Dockerfile +++ b/docker/ubuntu22/Dockerfile @@ -18,7 +18,10 @@ # FROM ubuntu:22.04 -LABEL maintainer="Apache ORC project " +LABEL org.opencontainers.image.authors="Apache ORC project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache ORC on Ubuntu 22" +LABEL org.opencontainers.image.version="" ARG jdk=17 ARG cc=gcc diff --git a/docker/ubuntu24/Dockerfile b/docker/ubuntu24/Dockerfile index 34b3924330..00cd2d67e7 100644 --- a/docker/ubuntu24/Dockerfile +++ b/docker/ubuntu24/Dockerfile @@ -18,7 +18,10 @@ # FROM ubuntu:24.04 -LABEL maintainer="Apache ORC project " +LABEL org.opencontainers.image.authors="Apache ORC project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache ORC on Ubuntu 24" +LABEL org.opencontainers.image.version="" ARG jdk=21 ARG cc=gcc diff --git a/java/.mvn/jvm.config b/java/.mvn/jvm.config new file mode 100644 index 0000000000..81b88d8173 --- /dev/null +++ b/java/.mvn/jvm.config @@ -0,0 +1 @@ +--enable-native-access=ALL-UNNAMED diff --git a/java/bench/README.md b/java/bench/README.md index 838433567b..7854c5d6f7 100644 --- a/java/bench/README.md +++ b/java/bench/README.md @@ -15,7 +15,7 @@ There are three sub-modules to try to mitigate dependency hell: To build this library, run the following in the parent directory: -``` +```bash % ./mvnw clean package -Pbenchmark -DskipTests % cd bench ``` @@ -57,4 +57,3 @@ To run row-filter benchmark: To run spark benchmark: ```% java -jar spark/target/orc-benchmarks-spark-${ORC_VERSION}.jar spark data``` - diff --git a/java/bench/core/pom.xml b/java/bench/core/pom.xml index cf6fe1ad51..75baa7b69a 100644 --- a/java/bench/core/pom.xml +++ b/java/bench/core/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc-benchmarks - 2.1.0-SNAPSHOT + 2.3.0-SNAPSHOT .. 
diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/convert/GenerateVariants.java b/java/bench/core/src/java/org/apache/orc/bench/core/convert/GenerateVariants.java index 0450088d57..0f1e1965cf 100644 --- a/java/bench/core/src/java/org/apache/orc/bench/core/convert/GenerateVariants.java +++ b/java/bench/core/src/java/org/apache/orc/bench/core/convert/GenerateVariants.java @@ -122,6 +122,10 @@ public void run(String[] args) throws Exception { Configuration conf = new Configuration(); // Disable Hadoop checksums conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem"); + for (String key: System.getProperties().stringPropertyNames()) { + if (!key.startsWith("orc.")) continue; + conf.set(key, System.getProperty(key)); + } Path root = new Path(cli.getArgs()[0]); for (final String data: dataList) { diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroReader.java b/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroReader.java index 97b58a8fea..8474351f2b 100644 --- a/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroReader.java +++ b/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroReader.java @@ -41,6 +41,7 @@ import org.apache.orc.bench.core.convert.BatchReader; import java.io.IOException; +import java.math.BigInteger; import java.nio.ByteBuffer; import java.util.List; @@ -201,7 +202,14 @@ public void convert(ColumnVector cv, int row, Object value) { cv.isNull[row] = true; } else { DecimalColumnVector tc = (DecimalColumnVector) cv; - tc.vector[row].set(HiveDecimal.create(Math.round((double) value * multiplier))); + if (value instanceof ByteBuffer) { + tc.vector[row].set(getHiveDecimalFromByteBuffer((ByteBuffer) value, scale)); + } else if (value instanceof GenericData.Fixed) { + tc.vector[row].set(getHiveDecimalFromByteBuffer( + ByteBuffer.wrap(((GenericData.Fixed) value).bytes()), scale)); + } else { + tc.vector[row].set(HiveDecimal.create(Math.round((double) value * multiplier))); + } } } } @@ -289,6 +297,13 @@ static AvroConverter createConverter(TypeDescription types) { } } + static HiveDecimal getHiveDecimalFromByteBuffer(ByteBuffer byteBuffer, + int scale) { + byte[] result = getBytesFromByteBuffer(byteBuffer); + HiveDecimal dec = HiveDecimal.create(new BigInteger(result), scale); + return dec; + } + static byte[] getBytesFromByteBuffer(ByteBuffer byteBuffer) { byteBuffer.rewind(); byte[] result = new byte[byteBuffer.limit()]; diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroSchemaUtils.java b/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroSchemaUtils.java index 96df6b5ba1..65753553a4 100644 --- a/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroSchemaUtils.java +++ b/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroSchemaUtils.java @@ -78,8 +78,11 @@ public static Schema createAvroSchema(TypeDescription typeInfo) { case DECIMAL: String precision = String.valueOf(typeInfo.getPrecision()); String scale = String.valueOf(typeInfo.getScale()); + int bytes = PRECISION_TO_BYTE_COUNT[typeInfo.getPrecision() - 1]; schema = getSchemaFor("{" + - "\"type\":\"bytes\"," + + "\"type\":\"fixed\"," + + "\"name\":\"" + typeInfo.getFullFieldName() + "\"," + + "\"size\":" + bytes + "," + "\"logicalType\":\"decimal\"," + "\"precision\":" + precision + "," + "\"scale\":" + scale + "}"); @@ -189,4 +192,16 @@ private static Schema getSchemaFor(String str) { Schema.Parser parser = new Schema.Parser(); return 
parser.parse(str); } + + // org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe + // Map precision to the number of bytes needed for binary conversion. + public static final int[] PRECISION_TO_BYTE_COUNT = new int[38]; + + static { + for (int prec = 1; prec <= 38; prec++) { + // Estimated number of bytes needed. + PRECISION_TO_BYTE_COUNT[prec - 1] = (int) + Math.ceil((Math.log(Math.pow(10, prec) - 1) / Math.log(2) + 1) / 8); + } + } } diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroWriter.java b/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroWriter.java index d60ef6745d..34fa166673 100644 --- a/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroWriter.java +++ b/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroWriter.java @@ -40,7 +40,6 @@ import org.apache.orc.bench.core.convert.BatchWriter; import java.io.IOException; -import java.nio.Buffer; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.List; @@ -166,8 +165,12 @@ public Object convert(ColumnVector cv, int row) { } private static class DecimalConverter implements AvroConverter { + final Schema avroSchema; + final int precision; final int scale; - DecimalConverter(int scale) { + DecimalConverter(Schema avroSchema, int precision, int scale) { + this.avroSchema = avroSchema; + this.precision = precision; this.scale = scale; } public Object convert(ColumnVector cv, int row) { @@ -176,8 +179,7 @@ public Object convert(ColumnVector cv, int row) { } if (cv.noNulls || !cv.isNull[row]) { DecimalColumnVector vector = (DecimalColumnVector) cv; - return getBufferFromDecimal( - vector.vector[row].getHiveDecimal(), scale); + return decimalToBinary(vector.vector[row].getHiveDecimal(), avroSchema, precision, scale); } else { return null; } @@ -270,7 +272,7 @@ public static AvroConverter createConverter(TypeDescription types, case TIMESTAMP: return new TimestampConverter(); case DECIMAL: - return new DecimalConverter(types.getScale()); + return new DecimalConverter(avroSchema, types.getPrecision(), types.getScale()); case LIST: return new ListConverter(types, avroSchema); case STRUCT: @@ -356,11 +358,28 @@ public void close() throws IOException { writer.close(); } - static Buffer getBufferFromDecimal(HiveDecimal dec, int scale) { - if (dec == null) { - return null; + // org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriter.DecimalDataWriter.decimalToBinary() + private static GenericData.Fixed decimalToBinary(HiveDecimal hiveDecimal, + Schema avroSchema, int prec, int scale) { + byte[] decimalBytes = hiveDecimal.setScale(scale).unscaledValue().toByteArray(); + + // Estimated number of bytes needed. + int precToBytes = AvroSchemaUtils.PRECISION_TO_BYTE_COUNT[prec - 1]; + if (precToBytes == decimalBytes.length) { + // No padding needed. + return new GenericData.Fixed(avroSchema, decimalBytes); + } + + byte[] tgt = new byte[precToBytes]; + if (hiveDecimal.signum() == -1) { + // For negative numbers, initialize bits to 1 + for (int i = 0; i < precToBytes; i++) { + tgt[i] |= 0xFF; + } } - return ByteBuffer.wrap(dec.bigIntegerBytesScaled(scale)); + System.arraycopy(decimalBytes, 0, tgt, precToBytes - decimalBytes.length, + decimalBytes.length); // Padding leading zeroes/ones. 
+ return new GenericData.Fixed(avroSchema, tgt); } } diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonReader.java b/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonReader.java index 893b738b1c..ece88f08b8 100644 --- a/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonReader.java +++ b/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonReader.java @@ -172,8 +172,12 @@ public void convert(JsonElement value, ColumnVector vect, int row) { vect.isNull[row] = true; } else { TimestampColumnVector vector = (TimestampColumnVector) vect; - vector.set(row, Timestamp.valueOf(value.getAsString() - .replaceAll("[TZ]", " "))); + try { + vector.set(row, new Timestamp(value.getAsLong())); + } catch (NumberFormatException e) { + vector.set(row, Timestamp.valueOf(value.getAsString() + .replaceAll("[TZ]", " "))); + } } } } diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonWriter.java b/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonWriter.java index 00b3de22e6..527d8bf1cc 100644 --- a/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonWriter.java +++ b/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonWriter.java @@ -160,8 +160,7 @@ static void printValue(com.google.gson.stream.JsonWriter writer, ColumnVector ve (int) ((LongColumnVector) vector).vector[row]).toString()); break; case TIMESTAMP: - writer.value(((TimestampColumnVector) vector) - .asScratchTimestamp(row).toString()); + writer.value(((TimestampColumnVector) vector).getTimestampAsLong(row)); break; case LIST: printList(writer, (ListColumnVector) vector, schema, row); diff --git a/java/bench/core/src/resources/log4j.properties b/java/bench/core/src/resources/log4j.properties deleted file mode 100644 index 0df3f70e53..0000000000 --- a/java/bench/core/src/resources/log4j.properties +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
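Worth a worked example: the Avro writer change above stores DECIMAL values as fixed-width two's-complement bytes, where the width comes from `PRECISION_TO_BYTE_COUNT` and shorter representations are sign-extended (0x00 padding for positive values, 0xFF for negative ones). The standalone sketch below reproduces just that sizing and padding logic with plain `java.math` types; the class and method names are illustrative and not part of the patch.

```java
import java.math.BigDecimal;
import java.util.Arrays;

public class DecimalFixedSketch {
  // Same formula as PRECISION_TO_BYTE_COUNT: the smallest byte count that can
  // hold a signed value with `prec` decimal digits in two's complement.
  static int bytesForPrecision(int prec) {
    return (int) Math.ceil((Math.log(Math.pow(10, prec) - 1) / Math.log(2) + 1) / 8);
  }

  // Left-pad the unscaled two's-complement bytes to the fixed width,
  // filling with 0xFF for negative values (sign extension) and 0x00 otherwise.
  static byte[] toFixedBytes(BigDecimal value, int prec, int scale) {
    byte[] unscaled = value.setScale(scale).unscaledValue().toByteArray();
    int width = bytesForPrecision(prec);
    if (unscaled.length == width) {
      return unscaled;                 // already the right size, no padding
    }
    byte[] padded = new byte[width];
    if (value.signum() == -1) {
      Arrays.fill(padded, (byte) 0xFF);
    }
    System.arraycopy(unscaled, 0, padded, width - unscaled.length, unscaled.length);
    return padded;
  }

  public static void main(String[] args) {
    // decimal(10,2): 10 digits need 5 bytes; -12.34 has unscaled value -1234.
    System.out.println(bytesForPrecision(10));                 // 5
    byte[] b = toFixedBytes(new BigDecimal("-12.34"), 10, 2);
    System.out.println(Arrays.toString(b));                    // [-1, -1, -1, -5, 46]
  }
}
```

For `decimal(10,2)` the table yields 5 bytes, so `-12.34` (unscaled `-1234`, two bytes `FB 2E`) is emitted as `FF FF FF FB 2E`, matching the padding comments in the diff above.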
- -log4j.rootLogger=WARN, CONSOLE - -# CONSOLE is set to be a ConsoleAppender using a PatternLayout -log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout -log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %m%n diff --git a/java/bench/core/src/resources/taxi.schema b/java/bench/core/src/resources/taxi.schema index 720848faaa..adb1f54f8d 100644 --- a/java/bench/core/src/resources/taxi.schema +++ b/java/bench/core/src/resources/taxi.schema @@ -9,13 +9,13 @@ struct< PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, - fare_amount: decimal(8,2), - extra: decimal(8,2), - mta_tax: decimal(8,2), - tip_amount: decimal(8,2), - tolls_amount: decimal(8,2), - improvement_surcharge: decimal(8,2), - total_amount: decimal(8,2), + fare_amount: decimal(10,2), + extra: decimal(10,2), + mta_tax: decimal(10,2), + tip_amount: decimal(10,2), + tolls_amount: decimal(10,2), + improvement_surcharge: decimal(10,2), + total_amount: decimal(10,2), congestion_surcharge: int, airport_fee: int > diff --git a/java/bench/core/src/test/org/apache/orc/bench/core/impl/ChunkReadUtilTest.java b/java/bench/core/src/test/org/apache/orc/bench/core/impl/ChunkReadUtilTest.java index 1169998d86..7091927521 100644 --- a/java/bench/core/src/test/org/apache/orc/bench/core/impl/ChunkReadUtilTest.java +++ b/java/bench/core/src/test/org/apache/orc/bench/core/impl/ChunkReadUtilTest.java @@ -21,6 +21,9 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.util.VersionInfo; +import org.apache.orc.impl.HadoopShims; +import org.apache.orc.impl.HadoopShimsFactory; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -38,6 +41,9 @@ class ChunkReadUtilTest { private static long fileLength; private static final int ROW_COUNT = 524288; private static final int COL_COUNT = 16; + private static final HadoopShims SHIMS = HadoopShimsFactory.get(); + private static final boolean supportVectoredIO = + SHIMS.supportVectoredIO(VersionInfo.getVersion()); @BeforeAll public static void setup() throws IOException { @@ -57,7 +63,7 @@ public void testReadAll() throws IOException { Configuration conf = new Configuration(); readStart(); assertEquals(ROW_COUNT, ChunkReadUtil.readORCFile(filePath, conf, false)); - assertTrue((readEnd().getBytesRead() / (double) fileLength) > 1); + assertTrue(supportVectoredIO || (readEnd().getBytesRead() / (double) fileLength) > 1); } @Test @@ -75,7 +81,7 @@ public void testReadAlternateWMinSeekSize() throws IOException { readStart(); assertEquals(ROW_COUNT, ChunkReadUtil.readORCFile(filePath, conf, true)); double readFraction = readEnd().getBytesRead() / (double) fileLength; - assertTrue(readFraction > 1 && readFraction < 1.01); + assertTrue(supportVectoredIO || (readFraction > 1 && readFraction < 1.01)); } @Test @@ -85,6 +91,6 @@ public void testReadAlternateWMinSeekSizeDrop() throws IOException { readStart(); assertEquals(ROW_COUNT, ChunkReadUtil.readORCFile(filePath, conf, true)); double readFraction = readEnd().getBytesRead() / (double) fileLength; - assertTrue(readFraction > 1 && readFraction < 1.01); + assertTrue(supportVectoredIO || (readFraction > 1 && readFraction < 1.01)); } -} \ No newline at end of file +} diff --git a/java/bench/hive/pom.xml b/java/bench/hive/pom.xml index 8dba74a0de..1ede9d05da 100644 --- a/java/bench/hive/pom.xml +++ b/java/bench/hive/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc-benchmarks - 
2.1.0-SNAPSHOT + 2.3.0-SNAPSHOT .. diff --git a/java/bench/hive/src/java/org/apache/orc/bench/hive/ColumnProjectionBenchmark.java b/java/bench/hive/src/java/org/apache/orc/bench/hive/ColumnProjectionBenchmark.java index 9c1b7fd21a..48806faffe 100644 --- a/java/bench/hive/src/java/org/apache/orc/bench/hive/ColumnProjectionBenchmark.java +++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/ColumnProjectionBenchmark.java @@ -19,6 +19,7 @@ package org.apache.orc.bench.hive; import com.google.auto.service.AutoService; +import org.apache.commons.cli.CommandLine; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -38,6 +39,7 @@ import org.apache.orc.bench.core.IOCounters; import org.apache.orc.bench.core.OrcBenchmark; import org.apache.orc.bench.core.Utilities; +import org.apache.orc.bench.core.convert.GenerateVariants; import org.apache.parquet.hadoop.ParquetInputFormat; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; @@ -47,6 +49,7 @@ import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.options.OptionsBuilder; import java.net.URI; import java.util.List; @@ -78,7 +81,13 @@ public String getDescription() { @Override public void run(String[] args) throws Exception { - new Runner(Utilities.parseOptions(args, getClass())).run(); + CommandLine cmds = GenerateVariants.parseCommandLine(args); + new Runner(new OptionsBuilder() + .parent(Utilities.parseOptions(args, this.getClass())) + .param("compression", cmds.getOptionValue("compress", "snappy,gz,zstd").split(",")) + .param("dataset", cmds.getOptionValue("data", "github,sales,taxi").split(",")) + .build() + ).run(); } @Benchmark diff --git a/java/bench/hive/src/java/org/apache/orc/bench/hive/FullReadBenchmark.java b/java/bench/hive/src/java/org/apache/orc/bench/hive/FullReadBenchmark.java index dc1bcca922..8f3b1cbbaa 100644 --- a/java/bench/hive/src/java/org/apache/orc/bench/hive/FullReadBenchmark.java +++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/FullReadBenchmark.java @@ -25,6 +25,7 @@ import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumReader; import org.apache.avro.mapred.FsInput; +import org.apache.commons.cli.CommandLine; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -45,6 +46,7 @@ import org.apache.orc.bench.core.IOCounters; import org.apache.orc.bench.core.OrcBenchmark; import org.apache.orc.bench.core.Utilities; +import org.apache.orc.bench.core.convert.GenerateVariants; import org.apache.parquet.hadoop.ParquetInputFormat; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; @@ -54,6 +56,7 @@ import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.options.OptionsBuilder; import java.io.InputStream; import java.io.InputStreamReader; @@ -87,7 +90,13 @@ public String getDescription() { @Override public void run(String[] args) throws Exception { - new Runner(Utilities.parseOptions(args, getClass())).run(); + CommandLine cmds = GenerateVariants.parseCommandLine(args); + new Runner(new OptionsBuilder() + .parent(Utilities.parseOptions(args, this.getClass())) + .param("compression", cmds.getOptionValue("compress", "gz,snappy,zstd").split(",")) + .param("dataset", 
cmds.getOptionValue("data", "taxi,sales,github").split(",")) + .build() + ).run(); } @Benchmark diff --git a/java/bench/pom.xml b/java/bench/pom.xml index a50eb3a425..025d0ab97e 100644 --- a/java/bench/pom.xml +++ b/java/bench/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc - 2.1.0-SNAPSHOT + 2.3.0-SNAPSHOT ../pom.xml @@ -33,13 +33,15 @@ - 1.11.3 - 4.0.0 + 1.12.0 + 4.0.1 1.37 - 5.10.2 + 5.12.2 ${project.version} - 1.13.1 - 3.5.1 + 1.15.2 + 2.13 + 2.13.16 + 4.0.0 @@ -71,7 +73,7 @@ com.google.code.gson gson - 2.2.4 + 2.13.0 com.google.guava @@ -80,17 +82,17 @@ commons-cli commons-cli - 1.6.0 + 1.9.0 io.airlift aircompressor - 0.26 + 2.0.2 io.netty netty-all - 4.1.96.Final + 4.1.110.Final runtime @@ -106,7 +108,7 @@ org.apache.commons commons-csv - 1.10.0 + 1.14.0 org.apache.hadoop @@ -275,7 +277,7 @@ org.xerial.snappy snappy-java - 1.1.10.5 + 1.1.10.7 org.apache.parquet @@ -284,12 +286,12 @@ org.apache.spark - spark-catalyst_2.12 + spark-catalyst_${scala.binary.version} ${spark.version} org.apache.spark - spark-core_2.12 + spark-core_${scala.binary.version} ${spark.version} @@ -316,7 +318,7 @@ org.apache.spark - spark-sql_2.12 + spark-sql_${scala.binary.version} ${spark.version} @@ -335,7 +337,7 @@ org.apache.spark - spark-avro_2.12 + spark-avro_${scala.binary.version} ${spark.version} @@ -357,7 +359,7 @@ org.scala-lang scala-library - 2.12.18 + ${scala.version} org.slf4j diff --git a/java/bench/spark/pom.xml b/java/bench/spark/pom.xml index 7eeef0d00a..0ad26152a1 100644 --- a/java/bench/spark/pom.xml +++ b/java/bench/spark/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc-benchmarks - 2.1.0-SNAPSHOT + 2.3.0-SNAPSHOT .. @@ -32,7 +32,7 @@ UTF-8 - 2.15.2 + 2.18.2 @@ -51,6 +51,8 @@ org.apache.commons commons-lang3 + + compile org.apache.hive @@ -71,15 +73,15 @@ org.apache.spark - spark-catalyst_2.12 + spark-catalyst_${scala.binary.version} org.apache.spark - spark-core_2.12 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.12 + spark-sql_${scala.binary.version} org.apache.parquet @@ -88,7 +90,7 @@ org.apache.spark - spark-avro_2.12 + spark-avro_${scala.binary.version} org.jodd @@ -125,7 +127,7 @@ org.objenesis objenesis - 3.2 + 3.3 compile diff --git a/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java b/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java index 1285875dcf..86e65ae81e 100644 --- a/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java +++ b/java/bench/spark/src/java/org/apache/orc/bench/spark/SparkBenchmark.java @@ -61,9 +61,9 @@ import scala.Tuple2; import scala.collection.Iterator; import scala.collection.JavaConverters; -import scala.collection.Seq; import scala.collection.immutable.Map; import scala.collection.immutable.Map$; +import scala.collection.immutable.Seq; import java.io.IOException; import java.sql.Timestamp; @@ -74,7 +74,8 @@ @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.MICROSECONDS) @AutoService(OrcBenchmark.class) -@Fork(jvmArgsAppend = "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED") +@Fork(jvmArgsAppend = {"--add-opens=java.base/sun.nio.ch=ALL-UNNAMED", + "--add-opens=java.base/sun.util.calendar=ALL-UNNAMED"}) public class SparkBenchmark implements OrcBenchmark { private static final Path root = Utilities.getBenchmarkRoot(); @@ -195,6 +196,9 @@ public void fullRead(InputSource source, case "orc": options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918 break; + case "parquet": + options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918 + break; default: 
break; } @@ -228,6 +232,9 @@ public void partialRead(InputSource source, case "orc": options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918 break; + case "parquet": + options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918 + break; default: break; } @@ -303,6 +310,9 @@ public void pushDown(InputSource source, case "orc": options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918 break; + case "parquet": + options.add(new Tuple2<>("returning_batch", "true")); // SPARK-40918 + break; default: break; } diff --git a/java/core/pom.xml b/java/core/pom.xml index 4cafffc714..5095c1be1c 100644 --- a/java/core/pom.xml +++ b/java/core/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc - 2.1.0-SNAPSHOT + 2.3.0-SNAPSHOT ../pom.xml @@ -43,10 +43,6 @@ com.google.protobuf protobuf-java - - org.apache.commons - commons-lang3 - io.airlift aircompressor @@ -84,8 +80,18 @@ com.aayushatharva.brotli4j brotli4j + + org.locationtech.jts + jts-core + ${jts.version} + + + org.apache.commons + commons-lang3 + test + com.google.guava guava @@ -138,6 +144,11 @@ org.apache.maven.plugins maven-compiler-plugin + + + -proc:full + + org.apache.maven.plugins diff --git a/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java b/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java new file mode 100644 index 0000000000..db66084c13 --- /dev/null +++ b/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.orc; + +import org.apache.orc.geospatial.BoundingBox; +import org.apache.orc.geospatial.GeospatialTypes; + +public interface GeospatialColumnStatistics extends ColumnStatistics { + BoundingBox getBoundingBox(); + GeospatialTypes getGeospatialTypes(); +} diff --git a/java/core/src/java/org/apache/orc/OrcConf.java b/java/core/src/java/org/apache/orc/OrcConf.java index 9bc2b4492e..6516517ba2 100644 --- a/java/core/src/java/org/apache/orc/OrcConf.java +++ b/java/core/src/java/org/apache/orc/OrcConf.java @@ -18,7 +18,6 @@ package org.apache.orc; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import java.util.ArrayList; @@ -353,12 +352,12 @@ public String getString(Configuration conf) { public List getStringAsList(Configuration conf) { String value = getString(null, conf); List confList = new ArrayList<>(); - if (StringUtils.isEmpty(value)) { + if (value == null || value.isEmpty()) { return confList; } for (String str: value.split(",")) { - String trimStr = StringUtils.trim(str); - if (StringUtils.isNotEmpty(trimStr)) { + String trimStr = str.trim(); + if (!trimStr.isEmpty()) { confList.add(trimStr); } } diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java index 7dde0bc0fd..ded04b8abc 100644 --- a/java/core/src/java/org/apache/orc/OrcUtils.java +++ b/java/core/src/java/org/apache/orc/OrcUtils.java @@ -17,6 +17,7 @@ */ package org.apache.orc; +import org.apache.orc.TypeDescription.EdgeInterpolationAlgorithm; import org.apache.orc.impl.ParserUtils; import org.apache.orc.impl.ReaderImpl; import org.apache.orc.impl.SchemaEvolution; @@ -171,6 +172,23 @@ private static void appendOrcTypes(List result, TypeDescription t type.setPrecision(typeDescr.getPrecision()); type.setScale(typeDescr.getScale()); break; + case Geography: + type.setKind(OrcProto.Type.Kind.GEOGRAPHY); + type.setAlgorithm(switch (typeDescr.getEdgeInterpolationAlgorithm()) { + case SPHERICAL -> OrcProto.Type.EdgeInterpolationAlgorithm.SPHERICAL; + case VINCENTY -> OrcProto.Type.EdgeInterpolationAlgorithm.VINCENTY; + case THOMAS -> OrcProto.Type.EdgeInterpolationAlgorithm.THOMAS; + case ANDOYER -> OrcProto.Type.EdgeInterpolationAlgorithm.ANDOYER; + case KARNEY -> OrcProto.Type.EdgeInterpolationAlgorithm.KARNEY; + default -> throw new IllegalArgumentException("Unknown interpolation algorithm: " + + typeDescr.getEdgeInterpolationAlgorithm()); + }); + type.setCrs(typeDescr.getCrs()); + break; + case Geometry: + type.setKind(OrcProto.Type.Kind.GEOMETRY); + type.setCrs(typeDescr.getCrs()); + break; case LIST: type.setKind(OrcProto.Type.Kind.LIST); type.addSubtypes(children.get(0).getId()); @@ -325,6 +343,29 @@ TypeDescription convertTypeFromProtobuf(List types, result.withPrecision(type.getPrecision()); } break; + case GEOMETRY: + result = TypeDescription.createGeometry(); + if (type.hasCrs()) { + result.withCRS(type.getCrs()); + } + break; + case GEOGRAPHY: + result = TypeDescription.createGeography(); + if (type.hasCrs()) { + result.withCRS(type.getCrs()); + } + result.withEdgeInterpolationAlgorithm( + switch (type.getAlgorithm()) { + case SPHERICAL -> EdgeInterpolationAlgorithm.SPHERICAL; + case VINCENTY -> EdgeInterpolationAlgorithm.VINCENTY; + case THOMAS -> EdgeInterpolationAlgorithm.THOMAS; + case ANDOYER -> EdgeInterpolationAlgorithm.ANDOYER; + case KARNEY -> EdgeInterpolationAlgorithm.KARNEY; + default -> throw new IllegalArgumentException("Unknown interpolation algorithm: " + + type.getAlgorithm()); + } + 
); + break; case LIST: if (type.getSubtypesCount() != 1) { throw new FileFormatException("LIST type should contain exactly " + diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java index 8ea9fca1b2..c5ef48b047 100644 --- a/java/core/src/java/org/apache/orc/TypeDescription.java +++ b/java/core/src/java/org/apache/orc/TypeDescription.java @@ -44,12 +44,29 @@ public class TypeDescription public static final long MAX_DECIMAL64 = 999_999_999_999_999_999L; public static final long MIN_DECIMAL64 = -MAX_DECIMAL64; private static final int DEFAULT_LENGTH = 256; + private static final String DEFAULT_CRS = "OGC:CRS84"; static final Pattern UNQUOTED_NAMES = Pattern.compile("^[a-zA-Z0-9_]+$"); // type attributes public static final String ENCRYPT_ATTRIBUTE = "encrypt"; public static final String MASK_ATTRIBUTE = "mask"; + public enum EdgeInterpolationAlgorithm { + SPHERICAL("spherical"), + VINCENTY("vincenty"), + THOMAS("thomas"), + ANDOYER("andoyer"), + KARNEY("karney"); + + EdgeInterpolationAlgorithm(String name) { + this.name = name; + } + final String name; + } + + private static final EdgeInterpolationAlgorithm DEFAULT_EDGE_INTERPOLATION_ALGORITHM + = EdgeInterpolationAlgorithm.SPHERICAL; + @Override public int compareTo(TypeDescription other) { if (this == other) { @@ -116,7 +133,9 @@ public enum Category { MAP("map", false), STRUCT("struct", false), UNION("uniontype", false), - TIMESTAMP_INSTANT("timestamp with local time zone", true); + TIMESTAMP_INSTANT("timestamp with local time zone", true), + Geometry("geometry", true), + Geography("geography", true); Category(String name, boolean isPrimitive) { this.name = name; @@ -187,6 +206,14 @@ public static TypeDescription createDecimal() { return new TypeDescription(Category.DECIMAL); } + public static TypeDescription createGeometry() { + return new TypeDescription(Category.Geometry); + } + + public static TypeDescription createGeography() { + return new TypeDescription(Category.Geography); + } + /** * Parse TypeDescription from the Hive type names. This is the inverse * of TypeDescription.toString() @@ -239,6 +266,26 @@ public TypeDescription withScale(int scale) { return this; } + public TypeDescription withCRS(String crs) { + if (category != Category.Geometry && + category != Category.Geography) { + throw new IllegalArgumentException("crs is only allowed on Geometry/Geography" + + " and not " + category.name); + } + this.crs = crs; + return this; + } + + public TypeDescription withEdgeInterpolationAlgorithm( + EdgeInterpolationAlgorithm edgeInterpolationAlgorithm) { + if (category != Category.Geography) { + throw new IllegalArgumentException("edgeInterpolationAlgorithm is only allowed on Geography" + + " and not " + category.name); + } + this.edgeInterpolationAlgorithm = edgeInterpolationAlgorithm; + return this; + } + /** * Set an attribute on this type. * @param key the attribute name @@ -366,6 +413,8 @@ public TypeDescription clone() { result.maxLength = maxLength; result.precision = precision; result.scale = scale; + result.crs = crs; + result.edgeInterpolationAlgorithm = edgeInterpolationAlgorithm; if (fieldNames != null) { result.fieldNames.addAll(fieldNames); } @@ -557,6 +606,14 @@ public int getScale() { return scale; } + public String getCrs() { + return crs; + } + + public EdgeInterpolationAlgorithm getEdgeInterpolationAlgorithm() { + return edgeInterpolationAlgorithm; + } + /** * For struct types, get the list of field names. * @return the list of field names. 
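Since the new `Geometry`/`Geography` categories and their builder methods are introduced above, a short sketch of how a schema could declare such columns may help. It uses the existing `createStruct`/`addField` API; the printed form shown in the comment is an assumption based on `printToBuffer`, with the CRS defaulting to `OGC:CRS84` and the edge-interpolation algorithm (Geography only) defaulting to `SPHERICAL`.

```java
import org.apache.orc.TypeDescription;
import org.apache.orc.TypeDescription.EdgeInterpolationAlgorithm;

public class GeospatialSchemaSketch {
  public static void main(String[] args) {
    // Geometry column with an explicit CRS; Geography column with a chosen
    // edge-interpolation algorithm (only allowed on Geography).
    TypeDescription schema = TypeDescription.createStruct()
        .addField("id", TypeDescription.createLong())
        .addField("shape", TypeDescription.createGeometry().withCRS("OGC:CRS84"))
        .addField("region", TypeDescription.createGeography()
            .withEdgeInterpolationAlgorithm(EdgeInterpolationAlgorithm.KARNEY));

    // Expected to print the Hive-style names declared in Category, e.g.
    // struct<id:bigint,shape:geometry(OGC:CRS84),region:geography(OGC:CRS84,KARNEY)>
    System.out.println(schema);
  }
}
```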
@@ -664,6 +721,9 @@ public TypeDescription(Category category) { private int maxLength = DEFAULT_LENGTH; private int precision = DEFAULT_PRECISION; private int scale = DEFAULT_SCALE; + private String crs = DEFAULT_CRS; + private EdgeInterpolationAlgorithm edgeInterpolationAlgorithm + = DEFAULT_EDGE_INTERPOLATION_ALGORITHM; static void printFieldName(StringBuilder buffer, String name) { if (UNQUOTED_NAMES.matcher(name).matches()) { @@ -691,6 +751,18 @@ public void printToBuffer(StringBuilder buffer) { buffer.append(maxLength); buffer.append(')'); break; + case Geometry: + buffer.append('('); + buffer.append(crs); + buffer.append(')'); + break; + case Geography: + buffer.append('('); + buffer.append(crs); + buffer.append(','); + buffer.append(edgeInterpolationAlgorithm.name()); + buffer.append(')'); + break; case LIST: case MAP: case UNION: @@ -751,6 +823,16 @@ private void printJsonToBuffer(String prefix, StringBuilder buffer, buffer.append(", \"length\": "); buffer.append(maxLength); break; + case Geometry: + buffer.append(", \"crs\": "); + buffer.append(crs); + break; + case Geography: + buffer.append(", \"crs\": "); + buffer.append(crs); + buffer.append(", \"edge_interpolation_algorithm\": "); + buffer.append(edgeInterpolationAlgorithm.name()); + break; case LIST: case MAP: case UNION: diff --git a/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java new file mode 100644 index 0000000000..093e2c96c8 --- /dev/null +++ b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java @@ -0,0 +1,361 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.geospatial; + +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Envelope; +import org.locationtech.jts.geom.Geometry; + +/** + * Bounding box for Geometry or Geography type in the representation of min/max + * value pairs of coordinates from each axis. + * A bounding box is considered valid if none of the X / Y dimensions contain NaN. 
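As a usage illustration for the class defined below (not part of the patch): bounds start empty at positive/negative infinity, expand as geometries are folded in with `update`, and a box whose X or Y bounds contain NaN becomes invalid and invalidates any box it is merged into. The sketch assumes a JTS `GeometryFactory` for building test points.

```java
import org.apache.orc.geospatial.BoundingBox;
import org.locationtech.jts.geom.Coordinate;
import org.locationtech.jts.geom.GeometryFactory;

public class BoundingBoxSketch {
  public static void main(String[] args) {
    GeometryFactory factory = new GeometryFactory();

    // Per-stripe box: expands to cover every geometry it sees.
    BoundingBox stripe = new BoundingBox();
    stripe.update(factory.createPoint(new Coordinate(1.0, 2.0)));
    stripe.update(factory.createPoint(new Coordinate(-3.0, 7.5)));
    // X range is now [-3.0, 1.0], Y range [2.0, 7.5]; Z/M stay empty.

    // File-level box: accumulates per-stripe boxes via merge.
    BoundingBox fileLevel = new BoundingBox();
    fileLevel.merge(stripe);
    System.out.println(fileLevel.isValid());                              // true
    System.out.println(fileLevel.getXMin() + ".." + fileLevel.getXMax()); // -3.0..1.0

    // NaN X bounds make a box invalid, and merging an invalid box
    // marks the target invalid as well.
    BoundingBox bad = new BoundingBox(Double.NaN, Double.NaN, 0, 1,
        Double.NaN, Double.NaN, Double.NaN, Double.NaN);
    fileLevel.merge(bad);
    System.out.println(fileLevel.isValid());                              // false
  }
}
```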
+ */ +public class BoundingBox { + + private double xMin = Double.POSITIVE_INFINITY; + private double xMax = Double.NEGATIVE_INFINITY; + private double yMin = Double.POSITIVE_INFINITY; + private double yMax = Double.NEGATIVE_INFINITY; + private double zMin = Double.POSITIVE_INFINITY; + private double zMax = Double.NEGATIVE_INFINITY; + private double mMin = Double.POSITIVE_INFINITY; + private double mMax = Double.NEGATIVE_INFINITY; + private boolean valid = true; + + public BoundingBox() { + } + + public BoundingBox( + double xMin, double xMax, double yMin, double yMax, + double zMin, double zMax, double mMin, double mMax) { + this.xMin = xMin; + this.xMax = xMax; + this.yMin = yMin; + this.yMax = yMax; + this.zMin = zMin; + this.zMax = zMax; + this.mMin = mMin; + this.mMax = mMax; + + // Update the validity + valid = isXYValid(); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof BoundingBox other)) { + return false; + } + if (obj == this) { + return true; + } + + // Valid flag must be checked since invalid bounding boxes may have equal coordinates with the initial one + return xMin == other.xMin && xMax == other.xMax && yMin == other.yMin && yMax == other.yMax && + zMin == other.zMin && zMax == other.zMax && mMin == other.mMin && mMax == other.mMax && + valid == other.valid; + } + + @Override + public int hashCode() { + return Double.hashCode(xMin) ^ Double.hashCode(xMax) ^ + Double.hashCode(yMin) ^ Double.hashCode(yMax) ^ + Double.hashCode(zMin) ^ Double.hashCode(zMax) ^ + Double.hashCode(mMin) ^ Double.hashCode(mMax) ^ + Boolean.hashCode(valid); + } + + // Don't change `valid` here and let the caller maintain it + private void resetBBox() { + xMin = Double.POSITIVE_INFINITY; + xMax = Double.NEGATIVE_INFINITY; + yMin = Double.POSITIVE_INFINITY; + yMax = Double.NEGATIVE_INFINITY; + zMin = Double.POSITIVE_INFINITY; + zMax = Double.NEGATIVE_INFINITY; + mMin = Double.POSITIVE_INFINITY; + mMax = Double.NEGATIVE_INFINITY; + } + + public double getXMin() { + return xMin; + } + + public double getXMax() { + return xMax; + } + + public double getYMin() { + return yMin; + } + + public double getYMax() { + return yMax; + } + + public double getZMin() { + return zMin; + } + + public double getZMax() { + return zMax; + } + + public double getMMin() { + return mMin; + } + + public double getMMax() { + return mMax; + } + + /** + * Checks if the bounding box is valid. + * A bounding box is considered valid if none of the X / Y dimensions contain NaN. + * + * @return true if the bounding box is valid, false otherwise. + */ + public boolean isValid() { + return valid; + } + + /** + * Checks if the X and Y dimensions of the bounding box are valid. + * The X and Y dimensions are considered valid if none of the bounds contain NaN. + * + * @return true if the X and Y dimensions are valid, false otherwise. + */ + public boolean isXYValid() { + return isXValid() && isYValid(); + } + + /** + * Checks if the X dimension of the bounding box is valid. + * The X dimension is considered valid if neither bound contains NaN. + * + * @return true if the X dimension is valid, false otherwise. + */ + public boolean isXValid() { + return !(Double.isNaN(xMin) || Double.isNaN(xMax)); + } + + /** + * Checks if the Y dimension of the bounding box is valid. + * The Y dimension is considered valid if neither bound contains NaN. + * + * @return true if the Y dimension is valid, false otherwise. 
+ */ + public boolean isYValid() { + return !(Double.isNaN(yMin) || Double.isNaN(yMax)); + } + + /** + * Checks if the Z dimension of the bounding box is valid. + * The Z dimension is considered valid if none of the bounds contain NaN. + * + * @return true if the Z dimension is valid, false otherwise. + */ + public boolean isZValid() { + return !(Double.isNaN(zMin) || Double.isNaN(zMax)); + } + + /** + * Checks if the M dimension of the bounding box is valid. + * The M dimension is considered valid if none of the bounds contain NaN. + * + * @return true if the M dimension is valid, false otherwise. + */ + public boolean isMValid() { + return !(Double.isNaN(mMin) || Double.isNaN(mMax)); + } + + /** + * Checks if the bounding box is empty in the X / Y dimension. + * + * @return true if the bounding box is empty, false otherwise. + */ + public boolean isXYEmpty() { + return isXEmpty() || isYEmpty(); + } + + /** + * Checks if the bounding box is empty in the X dimension. + * + * @return true if the X dimension is empty, false otherwise. + */ + public boolean isXEmpty() { + return Double.isInfinite(xMin) && Double.isInfinite(xMax); + } + + /** + * Checks if the bounding box is empty in the Y dimension. + * + * @return true if the Y dimension is empty, false otherwise. + */ + public boolean isYEmpty() { + return Double.isInfinite(yMin) && Double.isInfinite(yMax); + } + + /** + * Checks if the bounding box is empty in the Z dimension. + * + * @return true if the Z dimension is empty, false otherwise. + */ + public boolean isZEmpty() { + return Double.isInfinite(zMin) && Double.isInfinite(zMax); + } + + /** + * Checks if the bounding box is empty in the M dimension. + * + * @return true if the M dimension is empty, false otherwise. + */ + public boolean isMEmpty() { + return Double.isInfinite(mMin) && Double.isInfinite(mMax); + } + + /** + * Expands this bounding box to include the bounds of another box. + * After merging, this bounding box will contain both its original extent + * and the extent of the other bounding box. + * + * @param other the other BoundingBox whose bounds will be merged into this one + */ + public void merge(BoundingBox other) { + if (!valid) { + return; + } + + // If other is null or invalid, mark this as invalid + if (other == null || !other.valid) { + valid = false; + resetBBox(); + return; + } + + this.xMin = Math.min(this.xMin, other.xMin); + this.xMax = Math.max(this.xMax, other.xMax); + this.yMin = Math.min(this.yMin, other.yMin); + this.yMax = Math.max(this.yMax, other.yMax); + this.zMin = Math.min(this.zMin, other.zMin); + this.zMax = Math.max(this.zMax, other.zMax); + this.mMin = Math.min(this.mMin, other.mMin); + this.mMax = Math.max(this.mMax, other.mMax); + + // Update the validity of this bounding box based on the other bounding box + valid = isXYValid(); + } + + /** + * Extends this bounding box to include the spatial extent of the provided geometry. + * The bounding box coordinates (min/max values for x, y, z, m) will be adjusted + * to encompass both the current bounds and the geometry's bounds. + * + * @param geometry The geometry whose coordinates will be used to update this bounding box. + * If null or empty, the method returns without making any changes. + */ + public void update(Geometry geometry) { + if (!valid) { + return; + } + + if (geometry == null || geometry.isEmpty()) { + return; + } + + // Updates the X and Y bounds of this bounding box with the given coordinates. 
+ // Updates are conditional: + // - X bounds are only updated if both minX and maxX are not NaN + // - Y bounds are only updated if both minY and maxY are not NaN + // This allows partial updates while preserving valid dimensions. + Envelope envelope = geometry.getEnvelopeInternal(); + if (!Double.isNaN(envelope.getMinX()) && !Double.isNaN(envelope.getMaxX())) { + xMin = Math.min(xMin, envelope.getMinX()); + xMax = Math.max(xMax, envelope.getMaxX()); + } + if (!Double.isNaN(envelope.getMinY()) && !Double.isNaN(envelope.getMaxY())) { + yMin = Math.min(yMin, envelope.getMinY()); + yMax = Math.max(yMax, envelope.getMaxY()); + } + + for (Coordinate coord : geometry.getCoordinates()) { + if (!Double.isNaN(coord.getZ())) { + zMin = Math.min(zMin, coord.getZ()); + zMax = Math.max(zMax, coord.getZ()); + } + if (!Double.isNaN(coord.getM())) { + mMin = Math.min(mMin, coord.getM()); + mMax = Math.max(mMax, coord.getM()); + } + } + + // Update the validity of this bounding box based on the other bounding box + valid = isXYValid(); + } + + /** + * Resets the bounding box to its initial state. + */ + public void reset() { + resetBBox(); + valid = true; + } + + /** + * Creates a copy of the current bounding box. + * + * @return a new BoundingBox instance with the same values as this one. + */ + public BoundingBox copy() { + return new BoundingBox( + this.xMin, this.xMax, + this.yMin, this.yMax, + this.zMin, this.zMax, + this.mMin, this.mMax); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("BoundingBox{xMin=") + .append(xMin) + .append(", xMax=") + .append(xMax) + .append(", yMin=") + .append(yMin) + .append(", yMax=") + .append(yMax) + .append(", zMin=") + .append(zMin) + .append(", zMax=") + .append(zMax) + .append(", mMin=") + .append(mMin) + .append(", mMax=") + .append(mMax); + + // Only include the valid flag when it's false + if (!valid) { + sb.append(", valid=false"); + } + + sb.append('}'); + return sb.toString(); + } +} diff --git a/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java new file mode 100644 index 0000000000..c1067adad4 --- /dev/null +++ b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.geospatial; + +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Geometry; + +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * A list of geospatial types from all instances in the Geometry or Geography column, + * or an empty list if they are not known. 
+ * + * The GeospatialTypes instance becomes invalid in the following cases: + * - When an unknown or unsupported geometry type is encountered during update + * - When merging with another invalid GeospatialTypes instance + * - When explicitly aborted using abort() + * + * When invalid, the types list is cleared and remains empty. All subsequent + * updates and merges are ignored until reset() is called. + */ +public class GeospatialTypes { + + private static final int UNKNOWN_TYPE_ID = -1; + private Set types = new HashSet<>(); + private boolean valid = true; + + public GeospatialTypes(Set types) { + this.types = types; + this.valid = true; + } + + public GeospatialTypes(Set types, boolean valid) { + this.types = types; + this.valid = valid; + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof GeospatialTypes other)) { + return false; + } + if (obj == this) { + return true; + } + return valid == other.valid && types.equals(other.types); + } + + @Override + public int hashCode() { + return types.hashCode() ^ Boolean.hashCode(valid); + } + + public GeospatialTypes() {} + + public Set getTypes() { + return types; + } + + /** + * Updates the types list with the given geometry's type. + * If the geometry type is unknown, the instance becomes invalid. + * + * @param geometry the geometry to process + */ + public void update(Geometry geometry) { + if (!valid) { + return; + } + + if (geometry == null || geometry.isEmpty()) { + return; + } + + int code = getGeometryTypeCode(geometry); + if (code != UNKNOWN_TYPE_ID) { + types.add(code); + } else { + valid = false; + types.clear(); + } + } + + public void merge(GeospatialTypes other) { + if (!valid) { + return; + } + + if (other == null || !other.valid) { + valid = false; + types.clear(); + return; + } + types.addAll(other.types); + } + + public void reset() { + types.clear(); + valid = true; + } + + public boolean isValid() { + return valid; + } + + public GeospatialTypes copy() { + return new GeospatialTypes(new HashSet<>(types), valid); + } + + /** + * Extracts the base geometry type code from a full type code. + * For example: 1001 (XYZ Point) -> 1 (Point) + * + * @param typeId the full geometry type code + * @return the base type code (1-7) + */ + private int getBaseTypeCode(int typeId) { + return typeId % 1000; + } + + /** + * Extracts the dimension prefix from a full type code. 
+ * For example: 1001 (XYZ Point) -> 1000 (XYZ) + * + * @param typeId the full geometry type code + * @return the dimension prefix (0, 1000, 2000, or 3000) + */ + private int getDimensionPrefix(int typeId) { + return (typeId / 1000) * 1000; + } + + @Override + public String toString() { + return "GeospatialTypes{" + "types=" + + types.stream().map(this::typeIdToString).collect(Collectors.toSet()) + '}'; + } + + private int getGeometryTypeId(Geometry geometry) { + return switch (geometry.getGeometryType()) { + case Geometry.TYPENAME_POINT -> 1; + case Geometry.TYPENAME_LINESTRING -> 2; + case Geometry.TYPENAME_POLYGON -> 3; + case Geometry.TYPENAME_MULTIPOINT -> 4; + case Geometry.TYPENAME_MULTILINESTRING -> 5; + case Geometry.TYPENAME_MULTIPOLYGON -> 6; + case Geometry.TYPENAME_GEOMETRYCOLLECTION -> 7; + default -> UNKNOWN_TYPE_ID; + }; + } + + /** + * Geospatial type codes: + * + * | Type | XY | XYZ | XYM | XYZM | + * | :----------------- | :--- | :--- | :--- | :--: | + * | Point | 0001 | 1001 | 2001 | 3001 | + * | LineString | 0002 | 1002 | 2002 | 3002 | + * | Polygon | 0003 | 1003 | 2003 | 3003 | + * | MultiPoint | 0004 | 1004 | 2004 | 3004 | + * | MultiLineString | 0005 | 1005 | 2005 | 3005 | + * | MultiPolygon | 0006 | 1006 | 2006 | 3006 | + * | GeometryCollection | 0007 | 1007 | 2007 | 3007 | + * + * See https://github.com/apache/parquet-format/blob/master/Geospatial.md#geospatial-types + */ + private int getGeometryTypeCode(Geometry geometry) { + int typeId = getGeometryTypeId(geometry); + if (typeId == UNKNOWN_TYPE_ID) { + return UNKNOWN_TYPE_ID; + } + Coordinate[] coordinates = geometry.getCoordinates(); + boolean hasZ = false; + boolean hasM = false; + if (coordinates.length > 0) { + Coordinate firstCoord = coordinates[0]; + hasZ = !Double.isNaN(firstCoord.getZ()); + hasM = !Double.isNaN(firstCoord.getM()); + } + if (hasZ) { + typeId += 1000; + } + if (hasM) { + typeId += 2000; + } + return typeId; + } + + private String typeIdToString(int typeId) { + String typeString; + + typeString = switch (typeId % 1000) { + case 1 -> Geometry.TYPENAME_POINT; + case 2 -> Geometry.TYPENAME_LINESTRING; + case 3 -> Geometry.TYPENAME_POLYGON; + case 4 -> Geometry.TYPENAME_MULTIPOINT; + case 5 -> Geometry.TYPENAME_MULTILINESTRING; + case 6 -> Geometry.TYPENAME_MULTIPOLYGON; + case 7 -> Geometry.TYPENAME_GEOMETRYCOLLECTION; + default -> { + yield "Unknown"; + } + }; + if (typeId >= 3000) { + typeString += " (XYZM)"; + } else if (typeId >= 2000) { + typeString += " (XYM)"; + } else if (typeId >= 1000) { + typeString += " (XYZ)"; + } else { + typeString += " (XY)"; + } + return typeString; + } +} diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index c5e13cc3c0..46b87bfdef 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -30,11 +30,17 @@ import org.apache.orc.DateColumnStatistics; import org.apache.orc.DecimalColumnStatistics; import org.apache.orc.DoubleColumnStatistics; +import org.apache.orc.GeospatialColumnStatistics; import org.apache.orc.IntegerColumnStatistics; import org.apache.orc.OrcProto; import org.apache.orc.StringColumnStatistics; import org.apache.orc.TimestampColumnStatistics; import org.apache.orc.TypeDescription; +import org.apache.orc.geospatial.BoundingBox; +import org.apache.orc.geospatial.GeospatialTypes; +import org.locationtech.jts.geom.Geometry; +import 
org.locationtech.jts.io.ParseException; +import org.locationtech.jts.io.WKBReader; import org.threeten.extra.chrono.HybridChronology; import java.sql.Date; @@ -42,6 +48,11 @@ import java.time.chrono.ChronoLocalDate; import java.time.chrono.Chronology; import java.time.chrono.IsoChronology; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; import java.util.TimeZone; @@ -102,6 +113,8 @@ public void updateBoolean(boolean value, int repetitions) { public void merge(ColumnStatisticsImpl other) { if (other instanceof BooleanStatisticsImpl bkt) { trueCount += bkt.trueCount; + } else if (!(other instanceof BooleanColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of boolean column statistics"); } else { if (isStatsExists() && trueCount != 0) { throw new IllegalArgumentException("Incompatible merging of boolean column statistics"); @@ -222,6 +235,8 @@ public void merge(ColumnStatisticsImpl other) { } } sum += otherColl.sum; + } else if (!(other instanceof CollectionColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of collection column statistics"); } else { if (isStatsExists()) { throw new IllegalArgumentException("Incompatible merging of collection column statistics"); @@ -397,6 +412,8 @@ public void merge(ColumnStatisticsImpl other) { overflow = true; } } + } else if (!(other instanceof IntegerColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of integer column statistics"); } else { if (isStatsExists() && hasMinimum) { throw new IllegalArgumentException("Incompatible merging of integer column statistics"); @@ -560,6 +577,8 @@ public void merge(ColumnStatisticsImpl other) { } } sum += dbl.sum; + } else if (!(other instanceof DoubleColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of double column statistics"); } else { if (isStatsExists() && hasMinimum) { throw new IllegalArgumentException("Incompatible merging of double column statistics"); @@ -763,6 +782,8 @@ public void merge(ColumnStatisticsImpl other) { } } sum += str.sum; + } else if (!(other instanceof StringColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of string column statistics"); } else { if (isStatsExists()) { throw new IllegalArgumentException("Incompatible merging of string column statistics"); @@ -993,9 +1014,10 @@ public void updateBinary(byte[] bytes, int offset, int length, @Override public void merge(ColumnStatisticsImpl other) { - if (other instanceof BinaryColumnStatistics) { - BinaryStatisticsImpl bin = (BinaryStatisticsImpl) other; + if (other instanceof BinaryStatisticsImpl bin) { sum += bin.sum; + } else if (!(other instanceof BinaryColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of binary column statistics"); } else { if (isStatsExists() && sum != 0) { throw new IllegalArgumentException("Incompatible merging of binary column statistics"); @@ -1128,6 +1150,8 @@ public void merge(ColumnStatisticsImpl other) { sum.mutateAdd(dec.sum); } } + } else if (!(other instanceof DecimalColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of decimal column statistics"); } else { if (isStatsExists() && minimum != null) { throw new IllegalArgumentException("Incompatible merging of decimal column statistics"); @@ -1321,6 +1345,8 @@ public void merge(ColumnStatisticsImpl other) { hasSum = false; } } + } else if (!(other instanceof 
DecimalColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of decimal column statistics"); } else { if (other.getNumberOfValues() != 0) { throw new IllegalArgumentException("Incompatible merging of decimal column statistics"); @@ -1486,6 +1512,8 @@ public void merge(ColumnStatisticsImpl other) { if (other instanceof DateStatisticsImpl dateStats) { minimum = Math.min(minimum, dateStats.minimum); maximum = Math.max(maximum, dateStats.maximum); + } else if (!(other instanceof DateColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of date column statistics"); } else { if (isStatsExists() && count != 0) { throw new IllegalArgumentException("Incompatible merging of date column statistics"); @@ -1698,6 +1726,8 @@ public void merge(ColumnStatisticsImpl other) { maximum = timestampStats.maximum; } } + } else if (!(other instanceof TimestampColumnStatistics)) { + throw new IllegalArgumentException("Incompatible merging of timestamp column statistics"); } else { if (isStatsExists() && count != 0) { throw new IllegalArgumentException("Incompatible merging of timestamp column statistics"); @@ -1839,6 +1869,167 @@ public Timestamp getMaximum() { private boolean hasNull = false; private long bytesOnDisk = 0; + private static final class GeospatialStatisticsImpl extends ColumnStatisticsImpl + implements GeospatialColumnStatistics { + private final BoundingBox boundingBox; + private final GeospatialTypes geospatialTypes; + private final WKBReader reader = new WKBReader(); + + GeospatialStatisticsImpl() { + this.boundingBox = new BoundingBox(); + this.geospatialTypes = new GeospatialTypes(); + } + + GeospatialStatisticsImpl(OrcProto.ColumnStatistics stats) { + super(stats); + BoundingBox boundingBoxOut = null; + GeospatialTypes geospatialTypesOut = null; + + OrcProto.GeospatialStatistics geoStatistics = stats.getGeospatialStatistics(); + if (geoStatistics.hasBbox()) { + OrcProto.BoundingBox bbox = geoStatistics.getBbox(); + boundingBoxOut = new BoundingBox( + bbox.hasXmin() ? bbox.getXmin() : Double.NaN, + bbox.hasXmax() ? bbox.getXmax() : Double.NaN, + bbox.hasYmin() ? bbox.getYmin() : Double.NaN, + bbox.hasYmax() ? bbox.getYmax() : Double.NaN, + bbox.hasZmin() ? bbox.getZmin() : Double.NaN, + bbox.hasZmax() ? bbox.getZmax() : Double.NaN, + bbox.hasMmin() ? bbox.getMmin() : Double.NaN, + bbox.hasMmax() ? 
bbox.getMmax() : Double.NaN); + } + + if (!geoStatistics.getGeospatialTypesList().isEmpty()) { + Set<Integer> types = new HashSet<>(geoStatistics.getGeospatialTypesList()); + geospatialTypesOut = new GeospatialTypes(types); + } + this.boundingBox = boundingBoxOut; + this.geospatialTypes = geospatialTypesOut; + } + + @Override + public void updateGeometry(BytesWritable value) { + if (value == null) { + return; + } + + try { + Geometry geom = reader.read(value.getBytes()); + boundingBox.update(geom); + geospatialTypes.update(geom); + } catch (ParseException e) { + throw new IllegalArgumentException("Invalid geospatial data - failed to parse WKB format", e); + } + } + + @Override + public void updateGeometry(byte[] bytes, int offset, int length) { + if (bytes == null) { + return; + } + BytesWritable value = new BytesWritable(); + value.set(bytes, offset, length); + updateGeometry(value); + } + + @Override + public void reset() { + super.reset(); + boundingBox.reset(); + geospatialTypes.reset(); + } + + @Override + public void merge(ColumnStatisticsImpl other) { + if (other instanceof GeospatialStatisticsImpl geoStats) { + boundingBox.merge(geoStats.boundingBox); + geospatialTypes.merge(geoStats.geospatialTypes); + } else { + throw new IllegalArgumentException("Incompatible merging of geospatial column statistics"); + } + super.merge(other); + } + + @Override + public OrcProto.ColumnStatistics.Builder serialize() { + OrcProto.ColumnStatistics.Builder builder = super.serialize(); + OrcProto.GeospatialStatistics.Builder geoStats = OrcProto.GeospatialStatistics.newBuilder(); + + OrcProto.BoundingBox.Builder bboxBuilder = OrcProto.BoundingBox.newBuilder(); + if (boundingBox.isValid() && !boundingBox.isXYEmpty()) { + bboxBuilder.setXmin(boundingBox.getXMin()); + bboxBuilder.setXmax(boundingBox.getXMax()); + bboxBuilder.setYmin(boundingBox.getYMin()); + bboxBuilder.setYmax(boundingBox.getYMax()); + if (boundingBox.isZValid() && !boundingBox.isZEmpty()) { + bboxBuilder.setZmin(boundingBox.getZMin()); + bboxBuilder.setZmax(boundingBox.getZMax()); + } + if (boundingBox.isMValid() && !boundingBox.isMEmpty()) { + bboxBuilder.setMmin(boundingBox.getMMin()); + bboxBuilder.setMmax(boundingBox.getMMax()); + } + geoStats.setBbox(bboxBuilder); + } + if (geospatialTypes.isValid() && !geospatialTypes.getTypes().isEmpty()) { + List<Integer> sortedTypes = new ArrayList<>(geospatialTypes.getTypes()); + Collections.sort(sortedTypes); + geoStats.addAllGeospatialTypes(sortedTypes); + } + builder.setGeospatialStatistics(geoStats); + return builder; + } + + @Override + public String toString() { + StringBuilder buf = new StringBuilder(super.toString()); + if (boundingBox.isValid()) { + buf.append(" bbox: "); + buf.append(boundingBox.toString()); + } + if (geospatialTypes.isValid()) { + buf.append(" types: "); + buf.append(geospatialTypes.toString()); + } + return buf.toString(); + } + + @Override + public BoundingBox getBoundingBox() { + return boundingBox; + } + + @Override + public GeospatialTypes getGeospatialTypes() { + return geospatialTypes; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof GeospatialStatisticsImpl that)) { + return false; + } + if (!super.equals(o)) { + return false; + } + + return boundingBox.equals(that.boundingBox) && + geospatialTypes.equals(that.geospatialTypes); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + result = prime * result + boundingBox.hashCode(); + result = prime * result + 
geospatialTypes.hashCode(); + return result; + } + } + ColumnStatisticsImpl(OrcProto.ColumnStatistics stats) { if (stats.hasNumberOfValues()) { count = stats.getNumberOfValues(); @@ -1936,6 +2127,14 @@ public void updateTimestamp(long value, int nanos) { throw new UnsupportedOperationException("Can't update timestamp"); } + public void updateGeometry(BytesWritable value) { + throw new UnsupportedOperationException("Can't update Geometry"); + } + + public void updateGeometry(byte[] bytes, int offset, int length) { + throw new UnsupportedOperationException("Can't update Geometry"); + } + public boolean isStatsExists() { return (count > 0 || hasNull == true); } @@ -2027,6 +2226,9 @@ public static ColumnStatisticsImpl create(TypeDescription schema, return new TimestampInstantStatisticsImpl(); case BINARY: return new BinaryStatisticsImpl(); + case Geography: + case Geometry: + return new GeospatialStatisticsImpl(); default: return new ColumnStatisticsImpl(); } @@ -2070,6 +2272,8 @@ public static ColumnStatisticsImpl deserialize(TypeDescription schema, writerUsedProlepticGregorian, convertToProlepticGregorian); } else if(stats.hasBinaryStatistics()) { return new BinaryStatisticsImpl(stats); + } else if (stats.hasGeospatialStatistics()) { + return new GeospatialStatisticsImpl(stats); } else { return new ColumnStatisticsImpl(stats); } diff --git a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java index 4635973ab5..4861aa61fa 100644 --- a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java +++ b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java @@ -17,7 +17,6 @@ */ package org.apache.orc.impl; -import org.apache.commons.lang3.ArrayUtils; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; @@ -1446,6 +1445,7 @@ public void nextVector(ColumnVector previousVector, } public static class StringGroupFromBinaryTreeReader extends ConvertTreeReader { + public static final byte[] EMPTY_BYTE_ARRAY = new byte[0]; private final TypeDescription readerType; private BytesColumnVector inBytesColVector; private BytesColumnVector outBytesColVector; @@ -1461,7 +1461,7 @@ public void setConvertVectorElement(int elementNum) throws IOException { byte[] bytes = inBytesColVector.vector[elementNum]; int start = inBytesColVector.start[elementNum]; int length = inBytesColVector.length[elementNum]; - final byte[] string = (length == 0) ? ArrayUtils.EMPTY_BYTE_ARRAY : new byte[3 * length - 1]; + final byte[] string = (length == 0) ? EMPTY_BYTE_ARRAY : new byte[3 * length - 1]; for(int p = 0; p < string.length; p += 2) { if (p != 0) { string[p++] = ' '; @@ -1533,6 +1533,7 @@ public void nextVector(ColumnVector previousVector, // Allocate column vector for file; cast column vector for reader. longColVector = new LongColumnVector(batchSize); timestampColVector = (TimestampColumnVector) previousVector; + timestampColVector.setIsUTC(useUtc); } else { longColVector.ensureSize(batchSize, false); } @@ -1597,6 +1598,7 @@ public void nextVector(ColumnVector previousVector, // Allocate column vector for file; cast column vector for reader. 
doubleColVector = new DoubleColumnVector(batchSize); timestampColVector = (TimestampColumnVector) previousVector; + timestampColVector.setIsUTC(useUtc); } else { doubleColVector.ensureSize(batchSize, false); } @@ -1661,6 +1663,7 @@ public void nextVector(ColumnVector previousVector, // Allocate column vector for file; cast column vector for reader. decimalColVector = new DecimalColumnVector(batchSize, precision, scale); timestampColVector = (TimestampColumnVector) previousVector; + timestampColVector.setIsUTC(useUtc); } else { decimalColVector.ensureSize(batchSize, false); } @@ -1676,6 +1679,7 @@ public void nextVector(ColumnVector previousVector, public static class TimestampFromStringGroupTreeReader extends ConvertTreeReader { private BytesColumnVector bytesColVector; private TimestampColumnVector timestampColVector; + private final boolean useUtc; private final DateTimeFormatter formatter; private final boolean useProlepticGregorian; @@ -1683,6 +1687,7 @@ public static class TimestampFromStringGroupTreeReader extends ConvertTreeReader Context context, boolean isInstant) throws IOException { super(columnId, getStringGroupTreeReader(columnId, fileType, context), context); + useUtc = isInstant || context.getUseUTCTimestamp(); useProlepticGregorian = context.useProlepticGregorian(); Chronology chronology = useProlepticGregorian ? IsoChronology.INSTANCE @@ -1722,6 +1727,7 @@ public void nextVector(ColumnVector previousVector, // Allocate column vector for file; cast column vector for reader. bytesColVector = new BytesColumnVector(batchSize); timestampColVector = (TimestampColumnVector) previousVector; + timestampColVector.setIsUTC(useUtc); } else { bytesColVector.ensureSize(batchSize, false); } @@ -1768,6 +1774,7 @@ public void nextVector(ColumnVector previousVector, // Allocate column vector for file; cast column vector for reader. 
longColVector = new DateColumnVector(batchSize); timestampColVector = (TimestampColumnVector) previousVector; + timestampColVector.setIsUTC(useUtc); } else { longColVector.ensureSize(batchSize, false); } diff --git a/java/core/src/java/org/apache/orc/impl/ParserUtils.java b/java/core/src/java/org/apache/orc/impl/ParserUtils.java index df2f8b5e19..c864465bde 100644 --- a/java/core/src/java/org/apache/orc/impl/ParserUtils.java +++ b/java/core/src/java/org/apache/orc/impl/ParserUtils.java @@ -31,6 +31,8 @@ import java.util.regex.Pattern; public class ParserUtils { + private static final TypeDescription.Category[] TYPE_DESCRIPTION_CATEGORY_VALUES + = TypeDescription.Category.values(); static TypeDescription.Category parseCategory(ParserUtils.StringPosition source) { StringBuilder word = new StringBuilder(); @@ -56,7 +58,7 @@ static TypeDescription.Category parseCategory(ParserUtils.StringPosition source) catString = catString.trim(); } if (!catString.isEmpty()) { - for (TypeDescription.Category cat : TypeDescription.Category.values()) { + for (TypeDescription.Category cat : TYPE_DESCRIPTION_CATEGORY_VALUES) { if (cat.getName().equals(catString)) { return cat; } diff --git a/java/core/src/java/org/apache/orc/impl/PhysicalFsWriter.java b/java/core/src/java/org/apache/orc/impl/PhysicalFsWriter.java index 4eb5f85623..87f777a7e1 100644 --- a/java/core/src/java/org/apache/orc/impl/PhysicalFsWriter.java +++ b/java/core/src/java/org/apache/orc/impl/PhysicalFsWriter.java @@ -116,8 +116,7 @@ public PhysicalFsWriter(FSDataOutputStream outputStream, CompressionCodec codec = OrcCodecPool.getCodec(opts.getCompress()); if (codec != null){ CompressionCodec.Options tempOptions = codec.getDefaultOptions(); - if (codec instanceof ZstdCodec && - codec.getDefaultOptions() instanceof ZstdCodec.ZstdOptions options) { + if (codec instanceof ZstdCodec && tempOptions instanceof ZstdCodec.ZstdOptions options) { OrcFile.ZstdCompressOptions zstdCompressOptions = opts.getZstdCompressOptions(); if (zstdCompressOptions != null) { options.setLevel(zstdCompressOptions.getCompressionZstdLevel()); diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java index 3afbff5fc3..9e018157f6 100644 --- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java @@ -65,6 +65,10 @@ public class ReaderImpl implements Reader { private static final Logger LOG = LoggerFactory.getLogger(ReaderImpl.class); + private static final OrcFile.Version[] ORC_FILE_VERSION_VALUES = OrcFile.Version.values(); + private static final OrcFile.WriterVersion[] ORC_FILE_WRITER_VERSION_VALUES + = OrcFile.WriterVersion.values(); + private static final int DIRECTORY_SIZE_GUESS = 16 * 1024; public static final int DEFAULT_COMPRESSION_BLOCK_SIZE = 256 * 1024; @@ -268,7 +272,7 @@ public static OrcFile.Version getFileVersion(List versionList) { if (versionList == null || versionList.isEmpty()) { return OrcFile.Version.V_0_11; } - for (OrcFile.Version version: OrcFile.Version.values()) { + for (OrcFile.Version version: ORC_FILE_VERSION_VALUES) { if (version.getMajor() == versionList.get(0) && version.getMinor() == versionList.get(1)) { return version; @@ -620,7 +624,7 @@ protected Supplier getFileSystemSupplier() { * @return the version of the software that produced the file */ public static OrcFile.WriterVersion getWriterVersion(int writerVersion) { - for(OrcFile.WriterVersion version: OrcFile.WriterVersion.values()) { + for(OrcFile.WriterVersion 
version: ORC_FILE_WRITER_VERSION_VALUES) { if (version.getId() == writerVersion) { return version; } diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java index 323f242471..5bd9809253 100644 --- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java @@ -17,7 +17,6 @@ */ package org.apache.orc.impl; -import org.apache.commons.lang3.ArrayUtils; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.type.HiveDecimal; @@ -340,7 +339,14 @@ protected RecordReaderImpl(ReaderImpl fileReader, this.startReadPhase = TypeReader.ReadPhase.ALL; } - this.rowIndexColsToRead = ArrayUtils.contains(rowIndexCols, true) ? rowIndexCols : null; + var hasTrue = false; + for (boolean value: rowIndexCols) { + if (value) { + hasTrue = true; + break; + } + } + this.rowIndexColsToRead = hasTrue ? rowIndexCols : null; TreeReaderFactory.ReaderContext readerContext = new TreeReaderFactory.ReaderContext() .setSchemaEvolution(evolution) @@ -757,6 +763,13 @@ static TruthValue evaluatePredicateRange(PredicateLeaf predicate, if (!range.hasValues()) { if (predicate.getOperator() == PredicateLeaf.Operator.IS_NULL) { return TruthValue.YES; + } else if (predicate.getOperator() == PredicateLeaf.Operator.NULL_SAFE_EQUALS) { + Object literal = predicate.getLiteral(); + if (literal == null) { + return TruthValue.YES; + } else { + return TruthValue.NO; + } } else { return TruthValue.NULL; } diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java b/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java index 0eabb421e0..e88cccc33a 100644 --- a/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java +++ b/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java @@ -17,12 +17,12 @@ */ package org.apache.orc.impl; -import org.apache.commons.lang3.builder.HashCodeBuilder; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileRange; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.io.DiskRangeList; +import org.apache.hadoop.util.VersionInfo; import org.apache.orc.CompressionCodec; import org.apache.orc.DataReader; import org.apache.orc.OrcProto; @@ -48,7 +48,8 @@ */ public class RecordReaderUtils { private static final HadoopShims SHIMS = HadoopShimsFactory.get(); - private static final boolean supportVectoredIO = SHIMS.supportVectoredIO(); + private static final boolean supportVectoredIO = + SHIMS.supportVectoredIO(VersionInfo.getVersion()); private static final Logger LOG = LoggerFactory.getLogger(RecordReaderUtils.class); private static class DefaultDataReader implements DataReader { @@ -302,9 +303,9 @@ public static boolean isDictionary(OrcProto.Stream.Kind kind, assert kind != OrcProto.Stream.Kind.DICTIONARY_COUNT; OrcProto.ColumnEncoding.Kind encodingKind = encoding.getKind(); return kind == OrcProto.Stream.Kind.DICTIONARY_DATA || - (kind == OrcProto.Stream.Kind.LENGTH && - (encodingKind == OrcProto.ColumnEncoding.Kind.DICTIONARY || - encodingKind == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2)); + (kind == OrcProto.Stream.Kind.LENGTH && + (encodingKind == OrcProto.ColumnEncoding.Kind.DICTIONARY || + encodingKind == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2)); } /** @@ -635,8 +636,8 @@ public boolean equals(Object rhs) { @Override public int hashCode() { - return new 
HashCodeBuilder().append(capacity).append(insertionGeneration) - .toHashCode(); + // This is identical to the previous hashCode from HashCodeBuilder + return (17 * 37 + capacity) * 37 + (int) (insertionGeneration ^ insertionGeneration >> 32); } } diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java index 09b4b2ae61..eacff4b063 100644 --- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java +++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java @@ -461,6 +461,8 @@ void buildConversion(TypeDescription fileType, case TIMESTAMP_INSTANT: case BINARY: case DATE: + case Geometry: + case Geography: // these are always a match break; case CHAR: diff --git a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java index 2a2adf50d7..785f568ff4 100644 --- a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java +++ b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java @@ -1154,6 +1154,12 @@ public void skipRows(long items, ReadPhase readPhase) throws IOException { } } + public static class GeospatialTreeReader extends BinaryTreeReader { + GeospatialTreeReader(int columnId, Context context) throws IOException { + super(columnId, context); + } + } + public static class TimestampTreeReader extends TreeReader { protected IntegerReader data = null; protected IntegerReader nanos = null; @@ -1551,7 +1557,6 @@ private void nextVector(DecimalColumnVector result, HiveDecimalWritable[] vector = result.vector; HiveDecimalWritable decWritable; if (result.noNulls) { - result.isRepeating = true; for (int r = 0; r < batchSize; ++r) { decWritable = vector[r]; if (!decWritable.serializationUtilsRead( @@ -1563,7 +1568,6 @@ private void nextVector(DecimalColumnVector result, setIsRepeatingIfNeeded(result, r); } } else if (!result.isRepeating || !result.isNull[0]) { - result.isRepeating = true; for (int r = 0; r < batchSize; ++r) { if (!result.isNull[r]) { decWritable = vector[r]; @@ -1595,7 +1599,6 @@ private void nextVector(DecimalColumnVector result, HiveDecimalWritable[] vector = result.vector; HiveDecimalWritable decWritable; if (result.noNulls) { - result.isRepeating = true; int previousIdx = 0; for (int r = 0; r != filterContext.getSelectedSize(); ++r) { int idx = filterContext.getSelected()[r]; @@ -1614,7 +1617,6 @@ private void nextVector(DecimalColumnVector result, } skipStreamRows(batchSize - previousIdx); } else if (!result.isRepeating || !result.isNull[0]) { - result.isRepeating = true; int previousIdx = 0; for (int r = 0; r != filterContext.getSelectedSize(); ++r) { int idx = filterContext.getSelected()[r]; @@ -1651,14 +1653,12 @@ private void nextVector(Decimal64ColumnVector result, // read the scales scaleReader.nextVector(result, scratchScaleVector, batchSize); if (result.noNulls) { - result.isRepeating = true; for (int r = 0; r < batchSize; ++r) { final long scaleFactor = powerOfTenTable[scale - scratchScaleVector[r]]; result.vector[r] = SerializationUtils.readVslong(valueStream) * scaleFactor; setIsRepeatingIfNeeded(result, r); } } else if (!result.isRepeating || !result.isNull[0]) { - result.isRepeating = true; for (int r = 0; r < batchSize; ++r) { if (!result.isNull[r]) { final long scaleFactor = powerOfTenTable[scale - scratchScaleVector[r]]; @@ -1686,7 +1686,6 @@ private void nextVector(Decimal64ColumnVector result, // Read all the scales scaleReader.nextVector(result, scratchScaleVector, batchSize); if 
(result.noNulls) { - result.isRepeating = true; int previousIdx = 0; for (int r = 0; r != filterContext.getSelectedSize(); r++) { int idx = filterContext.getSelected()[r]; @@ -1702,7 +1701,6 @@ private void nextVector(Decimal64ColumnVector result, } skipStreamRows(batchSize - previousIdx); } else if (!result.isRepeating || !result.isNull[0]) { - result.isRepeating = true; int previousIdx = 0; for (int r = 0; r != filterContext.getSelectedSize(); r++) { int idx = filterContext.getSelected()[r]; @@ -3036,6 +3034,9 @@ public static TypeReader createTreeReader(TypeDescription readerType, } return new DecimalTreeReader(fileType.getId(), fileType.getPrecision(), fileType.getScale(), context); + case Geography: + case Geometry: + return new GeospatialTreeReader(fileType.getId(), context); case STRUCT: return new StructTreeReader(fileType.getId(), readerType, context); case LIST: diff --git a/java/core/src/java/org/apache/orc/impl/TypeUtils.java b/java/core/src/java/org/apache/orc/impl/TypeUtils.java index a5daa89572..40d22e2c43 100644 --- a/java/core/src/java/org/apache/orc/impl/TypeUtils.java +++ b/java/core/src/java/org/apache/orc/impl/TypeUtils.java @@ -69,6 +69,8 @@ public static ColumnVector createColumn(TypeDescription schema, case BINARY: case CHAR: case VARCHAR: + case Geometry: + case Geography: return new BytesColumnVector(maxSize); case STRUCT: { List children = schema.getChildren(); diff --git a/java/core/src/java/org/apache/orc/impl/ZlibCodec.java b/java/core/src/java/org/apache/orc/impl/ZlibCodec.java index 398ac0d16b..d4275a4c26 100644 --- a/java/core/src/java/org/apache/orc/impl/ZlibCodec.java +++ b/java/core/src/java/org/apache/orc/impl/ZlibCodec.java @@ -169,6 +169,17 @@ public void decompress(ByteBuffer in, ByteBuffer out) throws IOException { out.arrayOffset() + out.position(), out.remaining()); out.position(count + out.position()); + + if (!inflater.finished() && !inflater.needsDictionary() && !inflater.needsInput() && + count == 0) { + if (out.remaining() == 0) { + throw new IOException("Decompress output buffer too small. in = " + in + + ", out = " + out); + } else { + throw new IOException("Decompress error. in = " + in + + ", out = " + out); + } + } } catch (DataFormatException dfe) { throw new IOException("Bad compression data", dfe); } diff --git a/java/core/src/java/org/apache/orc/impl/ZstdCodec.java b/java/core/src/java/org/apache/orc/impl/ZstdCodec.java index 6703a82c19..d352c860f4 100644 --- a/java/core/src/java/org/apache/orc/impl/ZstdCodec.java +++ b/java/core/src/java/org/apache/orc/impl/ZstdCodec.java @@ -152,7 +152,7 @@ public int hashCode() { @Override public Options getDefaultOptions() { - return DEFAULT_OPTIONS; + return DEFAULT_OPTIONS.copy(); } /** @@ -165,7 +165,7 @@ public Options getDefaultOptions() { * @param out the compressed bytes * @param overflow put any additional bytes here * @param options the options to control compression - * @return ZstdOptions + * @return true if input data is compressed. Otherwise, false. 
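+ * <p>Illustrative caller pattern (a sketch added to this javadoc, not code taken from this change):
+ * when this returns false, the caller keeps the original bytes rather than the compressed output, e.g.
+ * <pre>{@code
+ * if (!codec.compress(in, out, overflow, codec.getDefaultOptions())) {
+ *   writeOriginal(in);  // hypothetical fallback that stores the uncompressed buffer
+ * }
+ * }</pre>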
*/ @Override public boolean compress(ByteBuffer in, ByteBuffer out, diff --git a/java/core/src/java/org/apache/orc/impl/mask/RedactMaskFactory.java b/java/core/src/java/org/apache/orc/impl/mask/RedactMaskFactory.java index c6b65c3e8f..1debb93497 100644 --- a/java/core/src/java/org/apache/orc/impl/mask/RedactMaskFactory.java +++ b/java/core/src/java/org/apache/orc/impl/mask/RedactMaskFactory.java @@ -17,7 +17,6 @@ */ package org.apache.orc.impl.mask; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; @@ -135,7 +134,7 @@ public RedactMaskFactory(String... params) { OTHER_NUMBER_REPLACEMENT = getNextCodepoint(param, DEFAULT_NUMBER_OTHER); OTHER_REPLACEMENT = getNextCodepoint(param, DEFAULT_OTHER); String[] timeParams; - if (params.length < 2 || StringUtils.isBlank(params[1])) { + if (params.length < 2 || params[1].isBlank()) { timeParams = null; } else { timeParams = params[1].split("\\W+"); @@ -154,7 +153,7 @@ public RedactMaskFactory(String... params) { (SECOND_REPLACEMENT != UNMASKED_DATE); /* un-mask range */ - if(!(params.length < 3 || StringUtils.isBlank(params[2]))) { + if(!(params.length < 3 || params[2].isBlank())) { String[] unmaskIndexes = params[2].split(","); for(int i=0; i < unmaskIndexes.length; i++ ) { diff --git a/java/core/src/java/org/apache/orc/impl/reader/StripePlanner.java b/java/core/src/java/org/apache/orc/impl/reader/StripePlanner.java index 23afe89180..d796bcb0ad 100644 --- a/java/core/src/java/org/apache/orc/impl/reader/StripePlanner.java +++ b/java/core/src/java/org/apache/orc/impl/reader/StripePlanner.java @@ -206,7 +206,7 @@ public String getWriterTimezone() { public InStream getStream(StreamName name) throws IOException { StreamInformation stream = streams.get(name); return stream == null ? null - : InStream.create(name, stream.firstChunk, stream.offset, stream.length, + : InStream.create(name, stream.firstChunk, stream.offset, stream.length, getStreamOptions(stream.column, stream.kind)); } diff --git a/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java new file mode 100644 index 0000000000..676ca32a9b --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.orc.impl.writer; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.orc.OrcProto; +import org.apache.orc.TypeDescription; +import org.apache.orc.impl.CryptoUtils; +import org.apache.orc.impl.IntegerWriter; +import org.apache.orc.impl.PositionRecorder; +import org.apache.orc.impl.PositionedOutputStream; +import org.apache.orc.impl.StreamName; + +import java.io.IOException; +import java.util.function.Consumer; + +public class GeospatialTreeWriter extends TreeWriterBase { + private final PositionedOutputStream stream; + private final IntegerWriter length; + private boolean isDirectV2 = true; + private long rawDataSize = 0; + private boolean isGeometry = false; + + public GeospatialTreeWriter(TypeDescription schema, + WriterEncryptionVariant encryption, + WriterContext context) throws IOException { + super(schema, encryption, context); + this.isGeometry = schema.getCategory() == TypeDescription.Category.Geometry; + this.stream = context.createStream( + new StreamName(id, OrcProto.Stream.Kind.DATA, encryption)); + this.isDirectV2 = isNewWriteFormat(context); + this.length = createIntegerWriter(context.createStream( + new StreamName(id, OrcProto.Stream.Kind.LENGTH, encryption)), + false, isDirectV2, context); + if (rowIndexPosition != null) { + recordPosition(rowIndexPosition); + } + } + + @Override + OrcProto.ColumnEncoding.Builder getEncoding() { + OrcProto.ColumnEncoding.Builder result = super.getEncoding(); + if (isDirectV2) { + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2); + } else { + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT); + } + return result; + } + + @Override + public void writeBatch(ColumnVector vector, int offset, + int length) throws IOException { + super.writeBatch(vector, offset, length); + BytesColumnVector vec = (BytesColumnVector) vector; + if (vector.isRepeating) { + if (vector.noNulls || !vector.isNull[0]) { + for (int i = 0; i < length; ++i) { + stream.write(vec.vector[0], vec.start[0], + vec.length[0]); + this.length.write(vec.length[0]); + } + rawDataSize += (long) length * vec.length[0]; + if (isGeometry) { + indexStatistics.updateGeometry(vec.vector[0], vec.start[0], vec.length[0]); + } + if (createBloomFilter) { + if (bloomFilter != null) { + bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]); + } + bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]); + } + } + } else { + for (int i = 0; i < length; ++i) { + if (vec.noNulls || !vec.isNull[i + offset]) { + stream.write(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i]); + this.length.write(vec.length[offset + i]); + rawDataSize += vec.length[offset + i]; + if (isGeometry) { + indexStatistics.updateGeometry(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i]); + } + if (createBloomFilter) { + if (bloomFilter != null) { + bloomFilter.addBytes(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i]); + } + bloomFilterUtf8.addBytes(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i]); + } + } + } + } + } + + @Override + public void writeStripe(int requiredIndexEntries) throws IOException { + super.writeStripe(requiredIndexEntries); + if (rowIndexPosition != null) { + recordPosition(rowIndexPosition); + } + } + + @Override + void recordPosition(PositionRecorder recorder) throws IOException { + super.recordPosition(recorder); + stream.getPosition(recorder); + 
length.getPosition(recorder); + } + + @Override + public long estimateMemory() { + return super.estimateMemory() + stream.getBufferSize() + + length.estimateMemory(); + } + + @Override + public long getRawDataSize() { + return rawDataSize; + } + + @Override + public void flushStreams() throws IOException { + super.flushStreams(); + stream.flush(); + length.flush(); + } + + @Override + public void prepareStripe(int stripeId) { + super.prepareStripe(stripeId); + Consumer updater = CryptoUtils.modifyIvForStripe(stripeId); + stream.changeIv(updater); + length.changeIv(updater); + } +} diff --git a/java/core/src/java/org/apache/orc/impl/writer/TreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/TreeWriter.java index 71eb3a5648..de63f9efb6 100644 --- a/java/core/src/java/org/apache/orc/impl/writer/TreeWriter.java +++ b/java/core/src/java/org/apache/orc/impl/writer/TreeWriter.java @@ -185,6 +185,9 @@ static TreeWriter createSubtree(TypeDescription schema, return new ListTreeWriter(schema, encryption, streamFactory); case UNION: return new UnionTreeWriter(schema, encryption, streamFactory); + case Geometry: + case Geography: + return new GeospatialTreeWriter(schema, encryption, streamFactory); default: throw new IllegalArgumentException("Bad category: " + schema.getCategory()); diff --git a/java/core/src/test/org/apache/orc/TestColumnStatistics.java b/java/core/src/test/org/apache/orc/TestColumnStatistics.java index 2ef96e5f50..dea3359d92 100644 --- a/java/core/src/test/org/apache/orc/TestColumnStatistics.java +++ b/java/core/src/test/org/apache/orc/TestColumnStatistics.java @@ -20,18 +20,22 @@ import org.apache.commons.lang3.RandomStringUtils; import org.apache.commons.lang3.StringEscapeUtils; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; +import org.apache.orc.geospatial.BoundingBox; +import org.apache.orc.geospatial.GeospatialTypes; import org.apache.orc.impl.ColumnStatisticsImpl; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInfo; +import org.locationtech.jts.geom.*; +import org.locationtech.jts.io.WKBWriter; import java.io.File; import java.math.BigDecimal; @@ -44,12 +48,13 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; /** * Test ColumnStatisticsImpl for ORC. 
*/ -public class TestColumnStatistics { +public class TestColumnStatistics implements TestConf { @Test public void testLongSumOverflow() { @@ -699,20 +704,253 @@ public void testDecimalMinMaxStatistics() throws Exception { "Incorrect minimum value"); } + @Test + public void testBinaryMerge() { + TypeDescription schema = TypeDescription.createBinary(); + + ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema); + ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema); + stats1.increment(3); + stats1.updateBinary(new BytesWritable("bob".getBytes(StandardCharsets.UTF_8))); + stats1.updateBinary(new BytesWritable("david".getBytes(StandardCharsets.UTF_8))); + stats1.updateBinary(new BytesWritable("charles".getBytes(StandardCharsets.UTF_8))); + stats2.increment(2); + stats2.updateBinary(new BytesWritable("anne".getBytes(StandardCharsets.UTF_8))); + stats2.updateBinary(new BytesWritable("abcdef".getBytes(StandardCharsets.UTF_8))); + + assertEquals(15, ((BinaryColumnStatistics) stats1).getSum()); + assertEquals(10, ((BinaryColumnStatistics) stats2).getSum()); + + stats1.merge(stats2); + + assertEquals(25, ((BinaryColumnStatistics) stats1).getSum()); + } + + @Test + public void testMergeIncompatible() { + TypeDescription stringSchema = TypeDescription.createString(); + ColumnStatisticsImpl stringStats = ColumnStatisticsImpl.create(stringSchema); + + TypeDescription doubleSchema = TypeDescription.createDouble(); + ColumnStatisticsImpl doubleStats = ColumnStatisticsImpl.create(doubleSchema); + + stringStats.increment(3); + stringStats.updateString(new Text("bob")); + stringStats.updateString(new Text("david")); + stringStats.updateString(new Text("charles")); + + assertThrows(IllegalArgumentException.class, () -> { + doubleStats.merge(stringStats); + }); + + assertEquals(0, ((DoubleColumnStatistics) doubleStats).getNumberOfValues()); + } + + @Test + public void testUpdateGeometry() { + TypeDescription desc = TypeDescription.createGeometry(); + ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + + byte[][] points = { + wkbWriter.write(geometryFactory.createPoint(new Coordinate(1.0, 1.0))), + wkbWriter.write(geometryFactory.createPoint(new Coordinate(2.0, 2.0))), + }; + + for (byte[] point : points) { + stats.updateGeometry(new BytesWritable(point)); + } + + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats; + BoundingBox bbox = geometryStatistics.getBoundingBox(); + assertEquals(1.0, bbox.getXMin(), 0.0); + assertEquals(2.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(2.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + assertEquals("BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}", + bbox.toString()); + assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XY)]}", + geometryStatistics.toString()); + + GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); + assertTrue(geospatialTypes.getTypes().contains(1)); + assertEquals(1, 
geospatialTypes.getTypes().size()); + } + + @Test + public void testUpdateGeometryWithDifferentTypes() { + TypeDescription desc = TypeDescription.createGeometry(); + ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + + Point point = geometryFactory.createPoint(new Coordinate(1, 1)); + Coordinate[] lineCoords = new Coordinate[]{new Coordinate(1, 1), new Coordinate(2, 2)}; + LineString line = geometryFactory.createLineString(lineCoords); + Coordinate[] polygonCoords = new Coordinate[]{ + new Coordinate(0, 0), new Coordinate(3, 0), + new Coordinate(1, 3), new Coordinate(0, 1), + new Coordinate(0, 0) + }; + LinearRing shell = geometryFactory.createLinearRing(polygonCoords); + Polygon polygon = geometryFactory.createPolygon(shell); + + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats; + BoundingBox bbox = geometryStatistics.getBoundingBox(); + GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); + // Generate WKB and update stats + byte[] pointWkb = wkbWriter.write(point); + stats.updateGeometry(new BytesWritable(pointWkb)); + + assertEquals(1.0, bbox.getXMin(), 0.0); + assertEquals(1.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(1.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + assertEquals("BoundingBox{xMin=1.0, xMax=1.0, yMin=1.0, yMax=1.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}", + bbox.toString()); + + assertTrue(geospatialTypes.getTypes().contains(1)); + assertEquals(1, geospatialTypes.getTypes().size()); + assertEquals("GeospatialTypes{types=[Point (XY)]}", geospatialTypes.toString()); + + + assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=1.0, xMax=1.0, yMin=1.0, yMax=1.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XY)]}", + geometryStatistics.toString()); + + byte[] lineWkb = wkbWriter.write(line); + stats.updateGeometry(new BytesWritable(lineWkb)); + assertEquals(1.0, bbox.getXMin(), 0.0); + assertEquals(2.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(2.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + + assertTrue(geospatialTypes.getTypes().contains(1)); + assertTrue(geospatialTypes.getTypes().contains(2)); + assertEquals(2, geospatialTypes.getTypes().size()); + assertEquals("GeospatialTypes{types=[Point (XY), LineString (XY)]}", + geospatialTypes.toString()); + + byte[] polygonWkb = wkbWriter.write(polygon); + stats.updateGeometry(new BytesWritable(polygonWkb)); + stats.updateGeometry(new BytesWritable(lineWkb)); + assertEquals(0.0, bbox.getXMin(), 0.0); + assertEquals(3.0, bbox.getXMax(), 0.0); + assertEquals(0.0, bbox.getYMin(), 0.0); + assertEquals(3.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, 
bbox.getMMax(), 0.0); + + assertTrue(geospatialTypes.getTypes().contains(1)); + assertTrue(geospatialTypes.getTypes().contains(2)); + assertTrue(geospatialTypes.getTypes().contains(3)); + assertEquals(3, geospatialTypes.getTypes().size()); + assertEquals("GeospatialTypes{types=[Point (XY), LineString (XY), Polygon (XY)]}", + geospatialTypes.toString()); + + } + + @Test + public void testUpdateGeometryWithZCoordinates() { + TypeDescription desc = TypeDescription.createGeometry(); + ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(3); + + Point point1 = geometryFactory.createPoint(new Coordinate(0, 1, 2)); + Point point2 = geometryFactory.createPoint(new Coordinate(2, 1, 0)); + + stats.updateGeometry(new BytesWritable(wkbWriter.write(point1))); + stats.updateGeometry(new BytesWritable(wkbWriter.write(point2))); + + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats; + BoundingBox bbox = geometryStatistics.getBoundingBox(); + assertEquals(0.0, bbox.getXMin(), 0.0); + assertEquals(2.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(1.0, bbox.getYMax(), 0.0); + assertEquals(0.0, bbox.getZMin(), 0.0); + assertEquals(2.0, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + assertEquals("BoundingBox{xMin=0.0, xMax=2.0, yMin=1.0, yMax=1.0, zMin=0.0, zMax=2.0, mMin=Infinity, mMax=-Infinity}", + bbox.toString()); + assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=0.0, xMax=2.0, yMin=1.0, yMax=1.0, zMin=0.0, zMax=2.0, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XYZ)]}", + geometryStatistics.toString()); + + GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); + assertTrue(geospatialTypes.getTypes().contains(1001)); + assertEquals(1, geospatialTypes.getTypes().size()); + } + + @Test + public void testGeospatialMerge() { + TypeDescription desc = TypeDescription.createGeometry(); + ColumnStatisticsImpl stats0 = ColumnStatisticsImpl.create(desc); + ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(desc); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + + byte[][] points = { + wkbWriter.write(geometryFactory.createPoint(new Coordinate(1.0, 1.0))), + wkbWriter.write(geometryFactory.createPoint(new Coordinate(2.0, 2.0))), + }; + + stats0.updateGeometry(new BytesWritable(points[0])); + stats1.updateGeometry(new BytesWritable(points[1])); + + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats0; + stats0.merge(stats1); + + BoundingBox bbox = geometryStatistics.getBoundingBox(); + assertTrue(bbox.isXYValid()); + assertFalse(bbox.isXYEmpty()); + assertTrue(bbox.isZValid()); + assertTrue(bbox.isMValid()); + assertTrue(bbox.isZEmpty()); + assertTrue(bbox.isMEmpty()); + assertEquals(1.0, bbox.getXMin(), 0.0); + assertEquals(2.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(2.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + assertEquals("BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, 
mMax=-Infinity}", + bbox.toString()); + assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XY)]}", + geometryStatistics.toString()); + + GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); + assertTrue(geospatialTypes.getTypes().contains(1)); + assertEquals(1, geospatialTypes.getTypes().size()); + } Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; FileSystem fs; Path testFilePath; @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); - fs.setWorkingDirectory(workDir); - testFilePath = new Path( + testFilePath = new Path(workDir + File.separator + "TestOrcFile." + testInfo.getTestMethod().get().getName() + ".orc"); fs.delete(testFilePath, false); } diff --git a/java/core/src/test/org/apache/orc/TestConf.java b/java/core/src/test/org/apache/orc/TestConf.java new file mode 100644 index 0000000000..aedbeb8d47 --- /dev/null +++ b/java/core/src/test/org/apache/orc/TestConf.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc; + +import org.apache.hadoop.conf.Configuration; +import org.junit.jupiter.api.BeforeEach; + +/** + * A shared configuration for ORC tests. 
+ */ +public interface TestConf { + + Configuration conf = getNewConf(); + + @BeforeEach + default void clear() { + conf.clear(); + conf.setIfUnset("fs.defaultFS", "file:///"); + conf.setIfUnset("fs.file.impl.disable.cache", "true"); + } + + private static Configuration getNewConf() { + Configuration conf = new Configuration(); + conf.setIfUnset("fs.defaultFS", "file:///"); + conf.setIfUnset("fs.file.impl.disable.cache", "true"); + return conf; + } +} diff --git a/java/core/src/test/org/apache/orc/TestMinSeekSize.java b/java/core/src/test/org/apache/orc/TestMinSeekSize.java index 8e69bf678c..0040501fc4 100644 --- a/java/core/src/test/org/apache/orc/TestMinSeekSize.java +++ b/java/core/src/test/org/apache/orc/TestMinSeekSize.java @@ -18,7 +18,6 @@ package org.apache.orc; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; @@ -40,13 +39,12 @@ import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestMinSeekSize { +public class TestMinSeekSize implements TestConf { private static final Logger LOG = LoggerFactory.getLogger(TestMinSeekSize.class); private static final Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); private static final Path filePath = new Path(workDir, "min_seek_size_file.orc"); - private static Configuration conf; private static FileSystem fs; private static final TypeDescription schema = TypeDescription.createStruct() @@ -62,7 +60,6 @@ public class TestMinSeekSize { @BeforeAll public static void setup() throws IOException { - conf = new Configuration(); fs = FileSystem.get(conf); LOG.info("Creating file {} with schema {}", filePath, schema); diff --git a/java/core/src/test/org/apache/orc/TestNewIntegerEncoding.java b/java/core/src/test/org/apache/orc/TestNewIntegerEncoding.java index 7e1b1aa898..75508c3ad2 100644 --- a/java/core/src/test/org/apache/orc/TestNewIntegerEncoding.java +++ b/java/core/src/test/org/apache/orc/TestNewIntegerEncoding.java @@ -19,7 +19,6 @@ import com.google.common.collect.Lists; import com.google.common.primitives.Longs; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; @@ -42,7 +41,7 @@ import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestNewIntegerEncoding { +public class TestNewIntegerEncoding implements TestConf { private static Stream data() { return Stream.of( @@ -72,13 +71,11 @@ public static void appendLong(VectorizedRowBatch batch, Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; FileSystem fs; Path testFilePath; @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestOrcFile." 
+ testInfo.getTestMethod().get().getName() + ".orc"); diff --git a/java/core/src/test/org/apache/orc/TestOrcDSTNoTimezone.java b/java/core/src/test/org/apache/orc/TestOrcDSTNoTimezone.java index eb9095d609..49529add61 100644 --- a/java/core/src/test/org/apache/orc/TestOrcDSTNoTimezone.java +++ b/java/core/src/test/org/apache/orc/TestOrcDSTNoTimezone.java @@ -17,7 +17,6 @@ */ package org.apache.orc; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; @@ -40,15 +39,13 @@ * and it was written from a time zone that observes DST for one of the timestamp * values stored ('2014-06-06 12:34:56.0'). */ -public class TestOrcDSTNoTimezone { - Configuration conf; +public class TestOrcDSTNoTimezone implements TestConf { FileSystem fs; SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S"); static TimeZone defaultTimeZone = TimeZone.getDefault(); @BeforeEach public void openFileSystem() throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); } diff --git a/java/core/src/test/org/apache/orc/TestOrcFilterContext.java b/java/core/src/test/org/apache/orc/TestOrcFilterContext.java index 265956890d..a8916a2568 100644 --- a/java/core/src/test/org/apache/orc/TestOrcFilterContext.java +++ b/java/core/src/test/org/apache/orc/TestOrcFilterContext.java @@ -29,7 +29,6 @@ import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.orc.impl.OrcFilterContextImpl; @@ -47,7 +46,7 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestOrcFilterContext { +public class TestOrcFilterContext implements TestConf { private final TypeDescription schema = TypeDescription.createStruct() .addField("f1", TypeDescription.createLong()) .addField("f2", TypeDescription.createString()) @@ -74,7 +73,6 @@ public class TestOrcFilterContext { TypeDescription.createList(TypeDescription.createChar())) ) ); - private static Configuration configuration; private static FileSystem fileSystem; private static final Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" @@ -270,11 +268,10 @@ public void testRowFilterWithACIDTable() throws IOException { } private void createAcidORCFile() throws IOException { - configuration = new Configuration(); - fileSystem = FileSystem.get(configuration); + fileSystem = FileSystem.get(conf); try (Writer writer = OrcFile.createWriter(filePath, - OrcFile.writerOptions(configuration) + OrcFile.writerOptions(conf) .fileSystem(fileSystem) .overwrite(true) .rowIndexStride(8192) @@ -325,7 +322,7 @@ private void populateColumnValues(TypeDescription typeDescription, ColumnVector[ } private void readSingleRowWithFilter(int id) throws IOException { - Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(configuration).filesystem(fileSystem)); + Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf).filesystem(fileSystem)); SearchArgument searchArgument = SearchArgumentFactory.newBuilder() .in("int1", PredicateLeaf.Type.LONG, new Long(id)) .build(); diff --git a/java/core/src/test/org/apache/orc/TestOrcGeospatial.java 
b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java new file mode 100644 index 0000000000..f0e148fb0d --- /dev/null +++ b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.orc.impl.ColumnStatisticsImpl; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.geom.GeometryFactory; +import org.locationtech.jts.io.WKBReader; +import org.locationtech.jts.io.WKBWriter; + +import java.io.File; +import java.util.Arrays; + +import static org.junit.jupiter.api.Assertions.*; + +public class TestOrcGeospatial implements TestConf { + Path workDir = new Path(System.getProperty("test.tmp.dir", + "target" + File.separator + "test" + File.separator + "tmp")); + FileSystem fs; + Path testFilePath; + + @BeforeEach + public void openFileSystem(TestInfo testInfo) throws Exception { + fs = FileSystem.getLocal(conf); + testFilePath = new Path(workDir, "TestOrcGeospatial." 
+ + testInfo.getTestMethod().get().getName() + ".orc"); + fs.delete(testFilePath, false); + } + + @Test + public void testGeometryWriterWithNulls() throws Exception { + // Create a geometry schema and ORC file writer + TypeDescription schema = TypeDescription.createGeometry(); + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) + .bufferSize(10000)); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + WKBReader wkbReader = new WKBReader(); + + // Add data + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector geos = (BytesColumnVector) batch.cols[0]; + for (int i = 0; i < 100; i++) { + if (i % 2 == 0) { + byte[] bytes = wkbWriter.write(geometryFactory.createPoint(new Coordinate(i, i))); + geos.setVal(batch.size++, bytes); + } else { + geos.noNulls = false; + geos.isNull[batch.size++] = true; + } + } + writer.addRowBatch(batch); + writer.close(); + + // Verify reader schema + Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + assertEquals("geometry(OGC:CRS84)", reader.getSchema().toString()); + assertEquals(100, reader.getNumberOfRows()); + RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + geos = (BytesColumnVector) batch.cols[0]; + + // Verify statistics + ColumnStatistics[] stats = reader.getStatistics(); + assertEquals(1, stats.length); + assertEquals(50, stats[0].getNumberOfValues()); + assertTrue(stats[0].hasNull()); + assertInstanceOf(GeospatialColumnStatistics.class, stats[0]); + assertTrue(((GeospatialColumnStatistics) stats[0]).getBoundingBox().isXYValid()); + assertFalse(((GeospatialColumnStatistics) stats[0]).getBoundingBox().isZValid()); + assertFalse(((GeospatialColumnStatistics) stats[0]).getBoundingBox().isMValid()); + assertEquals("BoundingBox{xMin=0.0, xMax=98.0, yMin=0.0, yMax=98.0, zMin=NaN, zMax=NaN, mMin=NaN, mMax=NaN}", ((GeospatialColumnStatistics) stats[0]).getBoundingBox().toString()); + assertEquals("GeospatialTypes{types=[Point (XY)]}", ((GeospatialColumnStatistics) stats[0]).getGeospatialTypes().toString()); + + // Verify data + int idx = 0; + while (rows.nextBatch(batch)) { + for (int r = 0; r < batch.size; ++r) { + if (idx % 2 == 0) { + Geometry geom = wkbReader.read(Arrays.copyOfRange(geos.vector[r], geos.start[r], geos.start[r] + geos.length[r])); + assertEquals("Point", geom.getGeometryType()); + assertEquals(geom, geometryFactory.createPoint(new Coordinate(idx, idx))); + } else { + assertTrue(geos.isNull[r]); + } + idx += 1; + } + } + rows.close(); + } + + @Test + public void testGeographyWriterWithNulls() throws Exception { + // Create geography schema and ORC file writer + TypeDescription schema = TypeDescription.createGeography(); + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) + .bufferSize(10000)); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + WKBReader wkbReader = new WKBReader(); + + // Add data + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector geos = (BytesColumnVector) batch.cols[0]; + for (int i = 0; i < 100; i++) { + if (i % 2 == 0) { + byte[] bytes = wkbWriter.write(geometryFactory.createPoint(new Coordinate(i, i))); + geos.setVal(batch.size++, bytes); + } else { + geos.noNulls = false; + geos.isNull[batch.size++] = true; + } + } + writer.addRowBatch(batch); + writer.close(); + + // 
Verify reader schema + Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + assertEquals("geography(OGC:CRS84,SPHERICAL)", reader.getSchema().toString()); + assertEquals(100, reader.getNumberOfRows()); + + // Verify statistics, make sure there are no bounding box and geospatial types + ColumnStatistics[] stats = reader.getStatistics(); + assertEquals(1, stats.length); + assertEquals(50, stats[0].getNumberOfValues()); + assertTrue(stats[0].hasNull()); + assertInstanceOf(GeospatialColumnStatistics.class, stats[0]); + assertNull(((GeospatialColumnStatistics) stats[0]).getBoundingBox()); + assertNull(((GeospatialColumnStatistics) stats[0]).getGeospatialTypes()); + + // Verify Data + RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + geos = (BytesColumnVector) batch.cols[0]; + int idx = 0; + while (rows.nextBatch(batch)) { + for (int r = 0; r < batch.size; ++r) { + if (idx % 2 == 0) { + Geometry geom = wkbReader.read(Arrays.copyOfRange(geos.vector[r], geos.start[r], geos.start[r] + geos.length[r])); + assertEquals("Point", geom.getGeometryType()); + assertEquals(geom, geometryFactory.createPoint(new Coordinate(idx, idx))); + } else { + assertTrue(geos.isNull[r]); + } + idx += 1; + } + } + rows.close(); + } +} diff --git a/java/core/src/test/org/apache/orc/TestOrcNoTimezone.java b/java/core/src/test/org/apache/orc/TestOrcNoTimezone.java index 1b72e33e10..5a53738623 100644 --- a/java/core/src/test/org/apache/orc/TestOrcNoTimezone.java +++ b/java/core/src/test/org/apache/orc/TestOrcNoTimezone.java @@ -17,7 +17,6 @@ */ package org.apache.orc; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; @@ -39,15 +38,13 @@ * Test over an orc file that does not store time zone information in the footer * and it was written from a time zone that does not observe DST. 
*/ -public class TestOrcNoTimezone { - Configuration conf; +public class TestOrcNoTimezone implements TestConf { FileSystem fs; SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S"); static TimeZone defaultTimeZone = TimeZone.getDefault(); @BeforeEach public void openFileSystem() throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); } diff --git a/java/core/src/test/org/apache/orc/TestOrcNullOptimization.java b/java/core/src/test/org/apache/orc/TestOrcNullOptimization.java index b509b8a254..79473063ca 100644 --- a/java/core/src/test/org/apache/orc/TestOrcNullOptimization.java +++ b/java/core/src/test/org/apache/orc/TestOrcNullOptimization.java @@ -18,7 +18,6 @@ package org.apache.orc; import com.google.common.collect.Lists; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; @@ -43,7 +42,7 @@ import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestOrcNullOptimization { +public class TestOrcNullOptimization implements TestConf { TypeDescription createMyStruct() { return TypeDescription.createStruct() @@ -103,13 +102,11 @@ void addRow(Writer writer, VectorizedRowBatch batch, Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; FileSystem fs; Path testFilePath; @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestOrcNullOptimization." + testInfo.getTestMethod().get().getName() + ".orc"); diff --git a/java/core/src/test/org/apache/orc/TestOrcTimestampPPD.java b/java/core/src/test/org/apache/orc/TestOrcTimestampPPD.java index 0803d890bb..142c7423a4 100644 --- a/java/core/src/test/org/apache/orc/TestOrcTimestampPPD.java +++ b/java/core/src/test/org/apache/orc/TestOrcTimestampPPD.java @@ -18,7 +18,6 @@ package org.apache.orc; import com.google.common.collect.Lists; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; @@ -40,10 +39,9 @@ import static org.junit.jupiter.api.Assertions.assertEquals; -public class TestOrcTimestampPPD { +public class TestOrcTimestampPPD implements TestConf { Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; FileSystem fs; Path testFilePath; static TimeZone defaultTimeZone = TimeZone.getDefault(); @@ -53,7 +51,6 @@ public TestOrcTimestampPPD() { @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestOrcTimestampPPD." 
+ testInfo.getTestMethod().get().getName() + ".orc"); diff --git a/java/core/src/test/org/apache/orc/TestOrcTimezone1.java b/java/core/src/test/org/apache/orc/TestOrcTimezone1.java index fe871b9c45..e9ccb38314 100644 --- a/java/core/src/test/org/apache/orc/TestOrcTimezone1.java +++ b/java/core/src/test/org/apache/orc/TestOrcTimezone1.java @@ -18,7 +18,6 @@ package org.apache.orc; import com.google.common.collect.Lists; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; @@ -43,10 +42,9 @@ /** * */ -public class TestOrcTimezone1 { +public class TestOrcTimezone1 implements TestConf { Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; FileSystem fs; Path testFilePath; static TimeZone defaultTimeZone = TimeZone.getDefault(); @@ -79,7 +77,6 @@ private static Stream data() { @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestOrcFile." + testInfo.getTestMethod().get().getName() + ".orc"); diff --git a/java/core/src/test/org/apache/orc/TestOrcTimezone2.java b/java/core/src/test/org/apache/orc/TestOrcTimezone2.java index 69b6d676b0..488cc2d26c 100644 --- a/java/core/src/test/org/apache/orc/TestOrcTimezone2.java +++ b/java/core/src/test/org/apache/orc/TestOrcTimezone2.java @@ -18,7 +18,6 @@ package org.apache.orc; import com.google.common.collect.Lists; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; @@ -42,10 +41,9 @@ /** * */ -public class TestOrcTimezone2 { +public class TestOrcTimezone2 implements TestConf { Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; FileSystem fs; Path testFilePath; static TimeZone defaultTimeZone = TimeZone.getDefault(); @@ -66,7 +64,6 @@ private static Stream data() { @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestOrcFile." 
+ testInfo.getTestMethod().get().getName() + ".orc"); diff --git a/java/core/src/test/org/apache/orc/TestOrcTimezone3.java b/java/core/src/test/org/apache/orc/TestOrcTimezone3.java index 112d5dedd6..f8a16b16b1 100644 --- a/java/core/src/test/org/apache/orc/TestOrcTimezone3.java +++ b/java/core/src/test/org/apache/orc/TestOrcTimezone3.java @@ -18,7 +18,6 @@ package org.apache.orc; import com.google.common.collect.Lists; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; @@ -41,10 +40,9 @@ /** * */ -public class TestOrcTimezone3 { +public class TestOrcTimezone3 implements TestConf { Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; FileSystem fs; Path testFilePath; static TimeZone defaultTimeZone = TimeZone.getDefault(); @@ -55,7 +53,6 @@ private static Stream data() { @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestOrcTimezone3." + testInfo.getTestMethod().get().getName() + ".orc"); diff --git a/java/core/src/test/org/apache/orc/TestOrcTimezone4.java b/java/core/src/test/org/apache/orc/TestOrcTimezone4.java index 8c06e473ce..cb03e18210 100644 --- a/java/core/src/test/org/apache/orc/TestOrcTimezone4.java +++ b/java/core/src/test/org/apache/orc/TestOrcTimezone4.java @@ -18,7 +18,6 @@ package org.apache.orc; import com.google.common.collect.Lists; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; @@ -35,14 +34,14 @@ import java.util.TimeZone; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * */ -public class TestOrcTimezone4 { +public class TestOrcTimezone4 implements TestConf { Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; FileSystem fs; Path testFilePath; SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS"); @@ -53,7 +52,6 @@ public TestOrcTimezone4() { @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestOrcTimezone4." 
+ testInfo.getTestMethod().get().getName() + ".orc"); @@ -95,6 +93,7 @@ public void testTimestampWriter() throws Exception { times = (TimestampColumnVector) batch.cols[0]; int idx = 0; while (rows.nextBatch(batch)) { + assertTrue(times.isUTC()); for(int r=0; r < batch.size; ++r) { Timestamp timestamp = times.asScratchTimestamp(r); assertEquals(ts.get(idx++), formatter.format(timestamp)); diff --git a/java/core/src/test/org/apache/orc/TestOrcTimezonePPD.java b/java/core/src/test/org/apache/orc/TestOrcTimezonePPD.java index f21ef810c0..ea0af05af8 100644 --- a/java/core/src/test/org/apache/orc/TestOrcTimezonePPD.java +++ b/java/core/src/test/org/apache/orc/TestOrcTimezonePPD.java @@ -16,7 +16,6 @@ package org.apache.orc; import com.google.common.collect.Lists; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; @@ -54,12 +53,11 @@ /** * */ -public class TestOrcTimezonePPD { +public class TestOrcTimezonePPD implements TestConf { private static final Logger LOG = LoggerFactory.getLogger(TestOrcTimezonePPD.class); Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; FileSystem fs; Path testFilePath; static TimeZone defaultTimeZone = TimeZone.getDefault(); @@ -90,7 +88,6 @@ private static Stream data() { @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestOrcFile." + testInfo.getTestMethod().get().getName() + ".orc"); @@ -387,7 +384,7 @@ public void testTimestampAllNulls(String writerTimeZone, String readerTimeZone) PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", Timestamp.valueOf("2007-08-01 00:00:00.0"), null); - assertEquals(SearchArgument.TruthValue.NULL, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf)); + assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf)); pred = createPredicateLeaf(PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.TIMESTAMP, "x", null, null); assertEquals(SearchArgument.TruthValue.YES, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf)); diff --git a/java/core/src/test/org/apache/orc/TestOrcWithLargeStripeStatistics.java b/java/core/src/test/org/apache/orc/TestOrcWithLargeStripeStatistics.java index 30b2604bf9..9f86f017e8 100644 --- a/java/core/src/test/org/apache/orc/TestOrcWithLargeStripeStatistics.java +++ b/java/core/src/test/org/apache/orc/TestOrcWithLargeStripeStatistics.java @@ -16,7 +16,6 @@ */ package org.apache.orc; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; @@ -41,7 +40,7 @@ *

*/ @Disabled("ORC-1361") -public class TestOrcWithLargeStripeStatistics { +public class TestOrcWithLargeStripeStatistics implements TestConf { @ParameterizedTest @EnumSource(value = OrcFile.Version.class, mode = EnumSource.Mode.EXCLUDE, names = "FUTURE") @@ -49,7 +48,7 @@ public void testGetStripeStatisticsNoProtocolBufferExceptions(OrcFile.Version ve throws Exception { // Use a size that exceeds the protobuf limit (e.g., 1GB) to trigger protobuf exception Path p = createOrcFile(1024L << 20, version); - try (Reader reader = OrcFile.createReader(p, OrcFile.readerOptions(new Configuration()))) { + try (Reader reader = OrcFile.createReader(p, OrcFile.readerOptions(conf))) { assertTrue(reader.getStripeStatistics().isEmpty()); } } @@ -75,7 +74,6 @@ private static Path createOrcFile(long metadataSize, OrcFile.Version version) th TestOrcWithLargeStripeStatistics.class.getSimpleName() + "_" + ROW_STRIPE_NUM + "_" + version + ".orc"); // Modify defaults to force one row per stripe. - Configuration conf = new Configuration(); conf.set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "0"); TypeDescription schema = createTypeDescription(); OrcFile.WriterOptions writerOptions = diff --git a/java/core/src/test/org/apache/orc/TestProlepticConversions.java b/java/core/src/test/org/apache/orc/TestProlepticConversions.java index ff983b3c86..ae8201c60f 100644 --- a/java/core/src/test/org/apache/orc/TestProlepticConversions.java +++ b/java/core/src/test/org/apache/orc/TestProlepticConversions.java @@ -17,7 +17,6 @@ */ package org.apache.orc; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; @@ -56,7 +55,7 @@ * This class tests all of the combinations of reading and writing the hybrid * and proleptic calendars. 
*/ -public class TestProlepticConversions { +public class TestProlepticConversions implements TestConf { private static Stream data() { return Stream.of( @@ -69,12 +68,10 @@ private static Stream data() { private Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - private final Configuration conf; private final TimeZone UTC = TimeZone.getTimeZone("UTC"); private final GregorianCalendar PROLEPTIC = new GregorianCalendar(); private final GregorianCalendar HYBRID = new GregorianCalendar(); { - conf = new Configuration(); PROLEPTIC.setTimeZone(UTC); PROLEPTIC.setGregorianChange(new Date(Long.MIN_VALUE)); HYBRID.setTimeZone(UTC); diff --git a/java/core/src/test/org/apache/orc/TestReader.java b/java/core/src/test/org/apache/orc/TestReader.java index d4b648f5ed..f3c11d54a0 100644 --- a/java/core/src/test/org/apache/orc/TestReader.java +++ b/java/core/src/test/org/apache/orc/TestReader.java @@ -17,7 +17,6 @@ */ package org.apache.orc; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -30,16 +29,14 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; -public class TestReader { +public class TestReader implements TestConf { Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; FileSystem fs; Path testFilePath; @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, TestReader.class.getSimpleName() + "." 
+ testInfo.getTestMethod().get().getName() + ".orc"); diff --git a/java/core/src/test/org/apache/orc/TestRowFilteringComplexTypes.java b/java/core/src/test/org/apache/orc/TestRowFilteringComplexTypes.java index bebe3817ce..0f6b76e622 100644 --- a/java/core/src/test/org/apache/orc/TestRowFilteringComplexTypes.java +++ b/java/core/src/test/org/apache/orc/TestRowFilteringComplexTypes.java @@ -17,7 +17,6 @@ */ package org.apache.orc; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector; @@ -39,11 +38,10 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestRowFilteringComplexTypes { +public class TestRowFilteringComplexTypes implements TestConf { private Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - private Configuration conf; private FileSystem fs; private Path testFilePath; @@ -51,7 +49,6 @@ public class TestRowFilteringComplexTypes { @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); OrcConf.READER_USE_SELECTED.setBoolean(conf, true); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, diff --git a/java/core/src/test/org/apache/orc/TestRowFilteringComplexTypesNulls.java b/java/core/src/test/org/apache/orc/TestRowFilteringComplexTypesNulls.java index c45c94e166..248e6c88dd 100644 --- a/java/core/src/test/org/apache/orc/TestRowFilteringComplexTypesNulls.java +++ b/java/core/src/test/org/apache/orc/TestRowFilteringComplexTypesNulls.java @@ -18,7 +18,6 @@ package org.apache.orc; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.type.HiveDecimal; @@ -47,14 +46,13 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestRowFilteringComplexTypesNulls { +public class TestRowFilteringComplexTypesNulls implements TestConf { private static final Logger LOG = LoggerFactory.getLogger(TestRowFilteringComplexTypesNulls.class); private static final Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); private static final Path filePath = new Path(workDir, "complex_null_file.orc"); - private static Configuration conf; private static FileSystem fs; private static final TypeDescription schema = TypeDescription.createStruct() @@ -75,7 +73,6 @@ public class TestRowFilteringComplexTypesNulls { @BeforeAll public static void setup() throws IOException { - conf = new Configuration(); fs = FileSystem.get(conf); LOG.info("Creating file {} with schema {}", filePath, schema); diff --git a/java/core/src/test/org/apache/orc/TestRowFilteringIOSkip.java b/java/core/src/test/org/apache/orc/TestRowFilteringIOSkip.java index d0b19a9c05..fd32a431d7 100644 --- a/java/core/src/test/org/apache/orc/TestRowFilteringIOSkip.java +++ b/java/core/src/test/org/apache/orc/TestRowFilteringIOSkip.java @@ -48,13 +48,12 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestRowFilteringIOSkip { +public class TestRowFilteringIOSkip implements TestConf { private static final Logger LOG = LoggerFactory.getLogger(TestRowFilteringIOSkip.class); 
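
The recurring change in these test hunks is the same: the per-class `Configuration conf` field and the `conf = new Configuration()` line in each `@BeforeEach` are removed, and the class instead declares `implements TestConf` so all tests pick up one shared configuration. The `TestConf` interface itself is not part of this section of the diff, so the following is only a sketch of what such a shared-configuration interface could look like; the field name `conf` is the one assumption carried over from how the tests use it.

```java
// Illustrative sketch only -- the real org.apache.orc.TestConf is not shown in
// these hunks, so this body is an assumption, not a copy of the actual source.
package org.apache.orc;

import org.apache.hadoop.conf.Configuration;

public interface TestConf {
  // Interface fields are implicitly public static final, so every test class
  // declaring "implements TestConf" sees the same mutable Configuration as a
  // plain "conf", which is what allows the hunks above to delete the
  // per-class "conf = new Configuration()" boilerplate.
  Configuration conf = new Configuration();
}
```

Because the instance is shared and mutable, tests that need special settings still tweak it in their setup methods, e.g. `OrcConf.READER_USE_SELECTED.setBoolean(conf, true)` in the row-filtering tests above.
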
private static final Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); private static final Path filePath = new Path(workDir, "skip_file.orc"); - private static Configuration conf; private static FileSystem fs; private static final TypeDescription schema = TypeDescription.createStruct() @@ -71,7 +70,6 @@ public class TestRowFilteringIOSkip { @BeforeAll public static void setup() throws IOException { - conf = new Configuration(); fs = FileSystem.get(conf); LOG.info("Creating file {} with schema {}", filePath, schema); diff --git a/java/core/src/test/org/apache/orc/TestRowFilteringNoSkip.java b/java/core/src/test/org/apache/orc/TestRowFilteringNoSkip.java index 87c390e8a4..b4a677d869 100644 --- a/java/core/src/test/org/apache/orc/TestRowFilteringNoSkip.java +++ b/java/core/src/test/org/apache/orc/TestRowFilteringNoSkip.java @@ -17,7 +17,6 @@ */ package org.apache.orc; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; @@ -42,12 +41,11 @@ * As it turns out it is more expensive to skip non-selected rows rather that just decode all and propagate the * selected array. Skipping for these type breaks instruction pipelining and introduces more branch mispredictions. */ -public class TestRowFilteringNoSkip { +public class TestRowFilteringNoSkip implements TestConf { private Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - private Configuration conf; private FileSystem fs; private Path testFilePath; @@ -55,7 +53,6 @@ public class TestRowFilteringNoSkip { @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); OrcConf.READER_USE_SELECTED.setBoolean(conf, true); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestRowFilteringNoSkip." + diff --git a/java/core/src/test/org/apache/orc/TestRowFilteringSkip.java b/java/core/src/test/org/apache/orc/TestRowFilteringSkip.java index dafbd35d36..ea4bc583c0 100644 --- a/java/core/src/test/org/apache/orc/TestRowFilteringSkip.java +++ b/java/core/src/test/org/apache/orc/TestRowFilteringSkip.java @@ -17,7 +17,6 @@ */ package org.apache.orc; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; @@ -53,12 +52,11 @@ * Types that are skipped at row-level include: Decimal, Decimal64, Double, Float, Char, VarChar, String, Boolean, Timestamp * For the remaining types that are not row-skipped see {@link TestRowFilteringNoSkip} */ -public class TestRowFilteringSkip { +public class TestRowFilteringSkip implements TestConf { private Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - private Configuration conf; private FileSystem fs; private Path testFilePath; @@ -66,7 +64,6 @@ public class TestRowFilteringSkip { @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); OrcConf.READER_USE_SELECTED.setBoolean(conf, true); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestRowFilteringSkip." 
+ diff --git a/java/core/src/test/org/apache/orc/TestSelectedVector.java b/java/core/src/test/org/apache/orc/TestSelectedVector.java index 3e2e4750f0..b1accd78a9 100644 --- a/java/core/src/test/org/apache/orc/TestSelectedVector.java +++ b/java/core/src/test/org/apache/orc/TestSelectedVector.java @@ -18,7 +18,6 @@ package org.apache.orc; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; @@ -49,17 +48,15 @@ import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestSelectedVector { +public class TestSelectedVector implements TestConf { Path workDir = new Path(System.getProperty("test.tmp.dir")); - Configuration conf; FileSystem fs; Path testFilePath; Random random = new Random(); @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); conf.setInt(OrcConf.ROW_INDEX_STRIDE.getAttribute(), VectorizedRowBatch.DEFAULT_SIZE); fs = FileSystem.getLocal(conf); fs.setWorkingDirectory(workDir); diff --git a/java/core/src/test/org/apache/orc/TestStringDictionary.java b/java/core/src/test/org/apache/orc/TestStringDictionary.java index a7a1d714ca..9f3d4eb118 100644 --- a/java/core/src/test/org/apache/orc/TestStringDictionary.java +++ b/java/core/src/test/org/apache/orc/TestStringDictionary.java @@ -51,18 +51,16 @@ import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestStringDictionary { +public class TestStringDictionary implements TestConf { private Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - private Configuration conf; private FileSystem fs; private Path testFilePath; @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestStringDictionary." 
+ testInfo.getTestMethod().get().getName() + ".orc"); diff --git a/java/core/src/test/org/apache/orc/TestTypeDescription.java b/java/core/src/test/org/apache/orc/TestTypeDescription.java index 7dba23a9f9..3f811803f9 100644 --- a/java/core/src/test/org/apache/orc/TestTypeDescription.java +++ b/java/core/src/test/org/apache/orc/TestTypeDescription.java @@ -17,7 +17,6 @@ */ package org.apache.orc; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.junit.jupiter.api.Test; @@ -33,7 +32,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; -public class TestTypeDescription { +public class TestTypeDescription implements TestConf { @Test public void testJson() { TypeDescription bin = TypeDescription.createBinary(); @@ -369,7 +368,6 @@ public void testAttributes() throws IOException { // write a file with those attributes Path path = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp"), "attribute.orc"); - Configuration conf = new Configuration(); Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf).setSchema(schema).overwrite(true)); writer.close(); diff --git a/java/core/src/test/org/apache/orc/TestUnicode.java b/java/core/src/test/org/apache/orc/TestUnicode.java index 3706644315..c901ec72f9 100644 --- a/java/core/src/test/org/apache/orc/TestUnicode.java +++ b/java/core/src/test/org/apache/orc/TestUnicode.java @@ -18,7 +18,6 @@ package org.apache.orc; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; @@ -36,11 +35,10 @@ import static org.junit.jupiter.api.Assertions.assertEquals; -public class TestUnicode { +public class TestUnicode implements TestConf { Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; FileSystem fs; Path testFilePath; @@ -66,7 +64,6 @@ private static Stream data() { @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestOrcFile." 
+ testInfo.getTestMethod().get().getName() + ".orc"); diff --git a/java/core/src/test/org/apache/orc/TestUnrolledBitPack.java b/java/core/src/test/org/apache/orc/TestUnrolledBitPack.java index d30fc98df0..7735b59a08 100644 --- a/java/core/src/test/org/apache/orc/TestUnrolledBitPack.java +++ b/java/core/src/test/org/apache/orc/TestUnrolledBitPack.java @@ -20,7 +20,6 @@ import com.google.common.collect.Lists; import com.google.common.primitives.Longs; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; @@ -37,7 +36,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; -public class TestUnrolledBitPack { +public class TestUnrolledBitPack implements TestConf { private static Stream data() { return Stream.of( @@ -57,13 +56,11 @@ private static Stream data() { Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; FileSystem fs; Path testFilePath; @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestOrcFile." + testInfo.getTestMethod().get().getName() + ".orc"); diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java index c24514f697..76681f4621 100644 --- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java +++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java @@ -19,7 +19,6 @@ package org.apache.orc; import com.google.common.collect.Lists; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.type.HiveDecimal; @@ -102,7 +101,7 @@ /** * Tests for the vectorized reader and writer for ORC files. */ -public class TestVectorOrcFile { +public class TestVectorOrcFile implements TestConf { private static Stream data() { return Stream.of( @@ -193,13 +192,11 @@ private static ByteBuffer byteBuf(int... items) { Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; FileSystem fs; Path testFilePath; @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestVectorOrcFile." 
+ testInfo.getTestMethod().get().getName().replaceFirst("\\[[0-9]+\\]", "") diff --git a/java/core/src/test/org/apache/orc/impl/TestBitPack.java b/java/core/src/test/org/apache/orc/impl/TestBitPack.java index 53ac1ce4b0..e2e1a67c83 100644 --- a/java/core/src/test/org/apache/orc/impl/TestBitPack.java +++ b/java/core/src/test/org/apache/orc/impl/TestBitPack.java @@ -18,10 +18,10 @@ package org.apache.orc.impl; import com.google.common.primitives.Longs; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.orc.impl.writer.StreamOptions; +import org.apache.orc.TestConf; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInfo; @@ -35,20 +35,18 @@ import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; -public class TestBitPack { +public class TestBitPack implements TestConf { private static final int SIZE = 100; private static Random rand = new Random(100); Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; FileSystem fs; Path testFilePath; @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestOrcFile." + testInfo.getTestMethod().get().getName() + ".orc"); diff --git a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java index 54d5ac143f..f16d042fdb 100644 --- a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java +++ b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java @@ -18,13 +18,13 @@ package org.apache.orc.impl; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.orc.DecimalColumnStatistics; import org.apache.orc.OrcFile; import org.apache.orc.OrcProto; import org.apache.orc.Reader; +import org.apache.orc.TestConf; import org.apache.orc.TimestampColumnStatistics; import org.apache.orc.TypeDescription; import org.junit.jupiter.api.Test; @@ -37,7 +37,7 @@ import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestColumnStatisticsImpl { +public class TestColumnStatisticsImpl implements TestConf { @Test public void testUpdateDate() { @@ -78,7 +78,6 @@ public void testOldTimestamps() throws IOException { TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")); Path exampleDir = new Path(System.getProperty("example.dir")); Path file = new Path(exampleDir, "TestOrcFile.testTimestamp.orc"); - Configuration conf = new Configuration(); Reader reader = OrcFile.createReader(file, OrcFile.readerOptions(conf)); TimestampColumnStatistics stats = (TimestampColumnStatistics) reader.getStatistics()[0]; diff --git a/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java b/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java index a90a285a65..53f94cbf7e 100644 --- a/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java +++ b/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java @@ -36,6 +36,7 @@ import org.apache.orc.OrcFile.WriterOptions; import org.apache.orc.Reader; import org.apache.orc.RecordReader; +import 
org.apache.orc.TestConf; import org.apache.orc.TestProlepticConversions; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; @@ -58,12 +59,11 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.mock; -public class TestConvertTreeReaderFactory { +public class TestConvertTreeReaderFactory implements TestConf { private Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - private Configuration conf; private FileSystem fs; private Path testFilePath; private int LARGE_BATCH_SIZE; @@ -74,7 +74,6 @@ public class TestConvertTreeReaderFactory { public void setupPath(TestInfo testInfo) throws Exception { // Default CV length is 1024 this.LARGE_BATCH_SIZE = 1030; - this.conf = new Configuration(); this.fs = FileSystem.getLocal(conf); this.testFilePath = new Path(workDir, TestWriterImpl.class.getSimpleName() + testInfo.getTestMethod().get().getName().replaceFirst("\\[[0-9]+]", "") + @@ -85,7 +84,6 @@ public void setupPath(TestInfo testInfo) throws Exception { public TExpectedColumnVector createORCFileWithLargeArray( TypeDescription schema, Class expectedColumnType, boolean useDecimal64) throws IOException, ParseException { - conf = new Configuration(); fs = FileSystem.getLocal(conf); fs.setWorkingDirectory(workDir); Writer w = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).setSchema(schema)); @@ -115,7 +113,6 @@ public TExpectedColumnVector create public TExpectedColumnVector createORCFileWithBatchesOfIncreasingSizeInDifferentStripes( TypeDescription schema, Class typeClass, boolean useDecimal64) throws IOException, ParseException { - conf = new Configuration(); fs = FileSystem.getLocal(conf); fs.setWorkingDirectory(workDir); WriterOptions options = OrcFile.writerOptions(conf); @@ -178,8 +175,6 @@ public TExpectedColumnVector readOR options.schema(schema); String expected = options.toString(); - Configuration conf = new Configuration(); - Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf)); RecordReader rows = reader.rows(options); VectorizedRowBatch batch = schema.createRowBatchV2(); @@ -200,8 +195,6 @@ public void readORCFileIncreasingBatchSize(String typeString, Class expectedC options.schema(schema); String expected = options.toString(); - Configuration conf = new Configuration(); - Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf)); RecordReader rows = reader.rows(options); VectorizedRowBatch batch = schema.createRowBatchV2(); @@ -693,8 +686,6 @@ private void readDecimalInNullStripe(String typeString, Class expectedColumnT options.schema(schema); String expected = options.toString(); - Configuration conf = new Configuration(); - Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf)); RecordReader rows = reader.rows(options); VectorizedRowBatch batch = schema.createRowBatch(); @@ -707,7 +698,7 @@ private void readDecimalInNullStripe(String typeString, Class expectedColumnT assertTrue(batch.cols[0].isRepeating); StringBuilder sb = new StringBuilder(); batch.cols[0].stringifyValue(sb, 1023); - assertEquals(sb.toString(), expectedResult[0]); + assertEquals(expectedResult[0], sb.toString()); rows.nextBatch(batch); assertEquals(1024, batch.size); @@ -717,17 +708,17 @@ private void readDecimalInNullStripe(String typeString, Class expectedColumnT assertFalse(batch.cols[0].isRepeating); StringBuilder sb2 = new StringBuilder(); batch.cols[0].stringifyValue(sb2, 1023); - 
assertEquals(sb2.toString(), expectedResult[1]); + assertEquals(expectedResult[1], sb2.toString()); rows.nextBatch(batch); assertEquals(1024, batch.size); assertEquals(expected, options.toString()); assertEquals(batch.cols.length, 1); assertEquals(batch.cols[0].getClass(), expectedColumnType); - assertTrue(batch.cols[0].isRepeating); + assertFalse(batch.cols[0].isRepeating); StringBuilder sb3 = new StringBuilder(); batch.cols[0].stringifyValue(sb3, 1023); - assertEquals(sb3.toString(), expectedResult[2]); + assertEquals(expectedResult[2], sb3.toString()); } private void testDecimalConvertToLongInNullStripe() throws Exception { diff --git a/java/core/src/test/org/apache/orc/impl/TestCryptoUtils.java b/java/core/src/test/org/apache/orc/impl/TestCryptoUtils.java index 73d7231e61..9c2d891109 100644 --- a/java/core/src/test/org/apache/orc/impl/TestCryptoUtils.java +++ b/java/core/src/test/org/apache/orc/impl/TestCryptoUtils.java @@ -18,12 +18,12 @@ package org.apache.orc.impl; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.BytesWritable; import org.apache.orc.EncryptionAlgorithm; import org.apache.orc.InMemoryKeystore; import org.apache.orc.OrcConf; import org.apache.orc.OrcProto; +import org.apache.orc.TestConf; import org.junit.jupiter.api.Test; import java.io.IOException; @@ -35,7 +35,7 @@ import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestCryptoUtils { +public class TestCryptoUtils implements TestConf { @Test public void testCreateStreamIv() throws Exception { @@ -56,7 +56,6 @@ public void testCreateStreamIv() throws Exception { @Test public void testMemoryKeyProvider() throws IOException { - Configuration conf = new Configuration(); OrcConf.KEY_PROVIDER.setString(conf, "memory"); // Hard code the random so that we know the bytes that will come out. 
InMemoryKeystore provider = @@ -91,7 +90,6 @@ public void testMemoryKeyProvider() throws IOException { @Test public void testInvalidKeyProvider() throws IOException { - Configuration conf = new Configuration(); OrcConf.KEY_PROVIDER.setString(conf, ""); assertNull(CryptoUtils.getKeyProvider(conf, new Random())); } diff --git a/java/core/src/test/org/apache/orc/impl/TestEncryption.java b/java/core/src/test/org/apache/orc/impl/TestEncryption.java index 64fcbcf921..3ab2bb8b77 100644 --- a/java/core/src/test/org/apache/orc/impl/TestEncryption.java +++ b/java/core/src/test/org/apache/orc/impl/TestEncryption.java @@ -17,7 +17,6 @@ */ package org.apache.orc.impl; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; @@ -32,6 +31,7 @@ import org.apache.orc.OrcFile; import org.apache.orc.Reader; import org.apache.orc.RecordReader; +import org.apache.orc.TestConf; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.junit.jupiter.api.AfterEach; @@ -43,10 +43,9 @@ import static org.junit.jupiter.api.Assertions.assertEquals; -public class TestEncryption { +public class TestEncryption implements TestConf { Path workDir = new Path(System.getProperty("test.tmp.dir")); - Configuration conf; FileSystem fs; Path testFilePath; TypeDescription schema; @@ -56,11 +55,9 @@ public class TestEncryption { @BeforeEach public void openFileSystem() throws Exception { - conf = new Configuration(); conf.setInt(OrcConf.ROW_INDEX_STRIDE.getAttribute(), VectorizedRowBatch.DEFAULT_SIZE); fs = FileSystem.getLocal(conf); - fs.setWorkingDirectory(workDir); - testFilePath = new Path("testWriterImpl.orc"); + testFilePath = new Path(workDir, "TestEncryption.orc"); fs.create(testFilePath, true); schema = TypeDescription.fromString("struct"); byte[] kmsKey = "secret123".getBytes(StandardCharsets.UTF_8); diff --git a/java/core/src/test/org/apache/orc/impl/TestMemoryManager.java b/java/core/src/test/org/apache/orc/impl/TestMemoryManager.java index 247f615a47..7f1f8359d7 100644 --- a/java/core/src/test/org/apache/orc/impl/TestMemoryManager.java +++ b/java/core/src/test/org/apache/orc/impl/TestMemoryManager.java @@ -17,9 +17,9 @@ */ package org.apache.orc.impl; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.orc.MemoryManager; +import org.apache.orc.TestConf; import org.junit.jupiter.api.Test; import org.mockito.Mockito; @@ -32,7 +32,7 @@ /** * Test the ORC memory manager. 
*/ -public class TestMemoryManager { +public class TestMemoryManager implements TestConf { private static final double ERROR = 0.000001; private static class NullCallback implements MemoryManagerImpl.Callback { @@ -43,7 +43,6 @@ public boolean checkMemory(double newScale) { @Test public void testBasics() throws Exception { - Configuration conf = new Configuration(); MemoryManagerImpl mgr = new MemoryManagerImpl(conf); NullCallback callback = new NullCallback(); long poolSize = mgr.getTotalMemoryPool(); @@ -71,7 +70,6 @@ public void testBasics() throws Exception { @Test public void testConfig() throws Exception { - Configuration conf = new Configuration(); conf.set("hive.exec.orc.memory.pool", "0.9"); MemoryManagerImpl mgr = new MemoryManagerImpl(conf); long mem = @@ -84,7 +82,6 @@ public void testConfig() throws Exception { @Test public void testCallback() throws Exception { - Configuration conf = new Configuration(); MemoryManagerImpl mgr = new MemoryManagerImpl(conf); long pool = mgr.getTotalMemoryPool(); MemoryManager.Callback[] calls = new MemoryManager.Callback[20]; diff --git a/java/core/src/test/org/apache/orc/impl/TestOrcLargeStripe.java b/java/core/src/test/org/apache/orc/impl/TestOrcLargeStripe.java index 54463a0797..22ae8226fa 100644 --- a/java/core/src/test/org/apache/orc/impl/TestOrcLargeStripe.java +++ b/java/core/src/test/org/apache/orc/impl/TestOrcLargeStripe.java @@ -15,7 +15,6 @@ */ package org.apache.orc.impl; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -26,6 +25,7 @@ import org.apache.orc.OrcFile; import org.apache.orc.Reader; import org.apache.orc.RecordReader; +import org.apache.orc.TestConf; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.junit.jupiter.api.BeforeEach; @@ -56,18 +56,16 @@ import static org.mockito.Mockito.when; @ExtendWith(MockitoExtension.class) -public class TestOrcLargeStripe { +public class TestOrcLargeStripe implements TestConf { private Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; FileSystem fs; private Path testFilePath; @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestOrcFile." 
+ testInfo.getTestMethod().get().getName() + ".orc"); @@ -136,7 +134,6 @@ public void testEmpty() throws Exception { @Test public void testConfigMaxChunkLimit() throws IOException { - Configuration conf = new Configuration(); FileSystem fs = FileSystem.getLocal(conf); TypeDescription schema = TypeDescription.createTimestamp(); fs.delete(testFilePath, false); @@ -151,7 +148,6 @@ public void testConfigMaxChunkLimit() throws IOException { assertTrue(recordReader instanceof RecordReaderImpl); assertEquals(Integer.MAX_VALUE - 1024, ((RecordReaderImpl) recordReader).getMaxDiskRangeChunkLimit()); - conf = new Configuration(); conf.setInt(OrcConf.ORC_MAX_DISK_RANGE_CHUNK_LIMIT.getHiveConfName(), 1000); opts = OrcFile.readerOptions(conf); reader = OrcFile.createReader(testFilePath, opts); diff --git a/java/core/src/test/org/apache/orc/impl/TestPhysicalFsWriter.java b/java/core/src/test/org/apache/orc/impl/TestPhysicalFsWriter.java index 9feac31047..62fcc80b31 100644 --- a/java/core/src/test/org/apache/orc/impl/TestPhysicalFsWriter.java +++ b/java/core/src/test/org/apache/orc/impl/TestPhysicalFsWriter.java @@ -26,7 +26,9 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.util.Progressable; +import org.apache.orc.CompressionCodec; import org.apache.orc.CompressionKind; +import org.apache.orc.OrcConf; import org.apache.orc.OrcFile; import org.apache.orc.OrcProto; import org.apache.orc.PhysicalWriter; @@ -330,4 +332,23 @@ public void testShortBlock() throws IOException { assertEquals(62 * 1024, dirEntry.getDataLength()); assertEquals(endOfStripe, shim.lastShortBlock); } + + @Test + public void testZstdCodec() throws IOException { + CompressionCodec zstdCodec = OrcCodecPool.getCodec(CompressionKind.ZSTD); + int originalHashCode = zstdCodec.getDefaultOptions().hashCode(); + + Configuration conf = new Configuration(); + conf.setInt(OrcConf.COMPRESSION_ZSTD_LEVEL.getAttribute(), 9); + MockHadoopShim shim = new MockHadoopShim(); + TypeDescription schema = TypeDescription.fromString("int"); + OrcFile.WriterOptions opts = + OrcFile.writerOptions(conf) + .compress(CompressionKind.ZSTD) + .setSchema(schema) + .setShims(shim); + MemoryFileSystem fs = new MemoryFileSystem(); + PhysicalFsWriter writer = new PhysicalFsWriter(fs, new Path("test1.orc"), opts); + assertEquals(originalHashCode, zstdCodec.getDefaultOptions().hashCode()); + } } diff --git a/java/core/src/test/org/apache/orc/impl/TestPredicatePushDownBounds.java b/java/core/src/test/org/apache/orc/impl/TestPredicatePushDownBounds.java index c2799ff901..aec865201c 100644 --- a/java/core/src/test/org/apache/orc/impl/TestPredicatePushDownBounds.java +++ b/java/core/src/test/org/apache/orc/impl/TestPredicatePushDownBounds.java @@ -17,7 +17,6 @@ */ package org.apache.orc.impl; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory; @@ -54,26 +53,20 @@ public void testCornerCases() { BloomFilter bf = new BloomFilter(100); // FFF... to PPP... 
for (int i = 70; i <= 80; i++) { - final String inputString = StringUtils - .repeat(Character.toString((char) i), stringLength); + final String inputString = Character.toString((char) i).repeat(stringLength); bf.addString(inputString); } - final String longStringF = StringUtils - .repeat(Character.toString('F'), stringLength); - final String longStringP = StringUtils - .repeat(Character.toString('P'), stringLength); + final String longStringF = Character.toString('F').repeat(stringLength); + final String longStringP = Character.toString('P').repeat(stringLength); /* String that matches the upperbound value after truncation */ - final String upperboundString = - StringUtils.repeat(Character.toString('P'), 1023) + "Q"; + final String upperboundString = Character.toString('P').repeat(1023) + "Q"; /* String that matches the lower value after truncation */ - final String lowerboundString = StringUtils - .repeat(Character.toString('F'), 1024); + final String lowerboundString = Character.toString('F').repeat(1024); - final String shortStringF = StringUtils.repeat(Character.toString('F'), 50); - final String shortStringP = - StringUtils.repeat(Character.toString('P'), 50) + "Q"; + final String shortStringF = Character.toString('F').repeat(50); + final String shortStringP = Character.toString('P').repeat(50) + "Q"; /* Test for a case EQUALS where only upperbound is set */ final PredicateLeaf predicateUpperBoundEquals = TestRecordReaderImpl @@ -165,17 +158,13 @@ public void testNormalCase() throws Exception { BloomFilter bf = new BloomFilter(100); // FFF... to PPP... for (int i = 70; i <= 80; i++) { - final String inputString = StringUtils - .repeat(Character.toString((char) i), bfStringLength); + final String inputString = Character.toString((char) i).repeat(bfStringLength); bf.addString(inputString); } - final String longStringF = StringUtils - .repeat(Character.toString('F'), stringLength); - final String longStringP = StringUtils - .repeat(Character.toString('P'), stringLength); - final String predicateString = StringUtils - .repeat(Character.toString('I'), 50); + final String longStringF = Character.toString('F').repeat(stringLength); + final String longStringP = Character.toString('P').repeat(stringLength); + final String predicateString = Character.toString('I').repeat(50); /* Test for a case where only upperbound is set */ @@ -215,26 +204,20 @@ public void testIN() throws Exception { final BloomFilter bf = new BloomFilter(100); // FFF... to PPP... 
for (int i = 70; i <= 80; i++) { - final String inputString = StringUtils - .repeat(Character.toString((char) i), stringLength); + final String inputString = Character.toString((char) i).repeat(stringLength); bf.addString(inputString); } - final String longStringF = StringUtils - .repeat(Character.toString('F'), stringLength); - final String longStringP = StringUtils - .repeat(Character.toString('P'), stringLength); + final String longStringF = Character.toString('F').repeat(stringLength); + final String longStringP = Character.toString('P').repeat(stringLength); /* String that matches the upperbound value after truncation */ - final String upperboundString = - StringUtils.repeat(Character.toString('P'), 1023) + "Q"; + final String upperboundString = Character.toString('P').repeat(1023) + "Q"; /* String that matches the lower value after truncation */ - final String lowerboundString = StringUtils - .repeat(Character.toString('F'), 1024); + final String lowerboundString = Character.toString('F').repeat(1024); - final String shortStringF = StringUtils.repeat(Character.toString('F'), 50); - final String shortStringP = - StringUtils.repeat(Character.toString('P'), 50) + "Q"; + final String shortStringF = Character.toString('F').repeat(50); + final String shortStringP = Character.toString('P').repeat(50) + "Q"; final List args = new ArrayList(); args.add(upperboundString); diff --git a/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java b/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java index e343b8f428..003ae22a7b 100644 --- a/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java +++ b/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java @@ -39,6 +39,7 @@ import org.apache.orc.Reader; import org.apache.orc.RecordReader; import org.apache.orc.StripeStatistics; +import org.apache.orc.TestConf; import org.apache.orc.TestVectorOrcFile; import org.apache.orc.TypeDescription; import org.junit.jupiter.api.BeforeEach; @@ -60,7 +61,7 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestReaderImpl { +public class TestReaderImpl implements TestConf { private Path workDir = new Path(System.getProperty("example.dir", "../../examples/")); @@ -106,7 +107,6 @@ public void testEnsureOrcFooterCorrectORCFooter() throws IOException { public void testOptionSafety() throws IOException { Reader.Options options = new Reader.Options(); String expected = options.toString(); - Configuration conf = new Configuration(); Path path = new Path(TestVectorOrcFile.getFileFromClasspath ("orc-file-11-format.orc")); try (Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf)); @@ -310,7 +310,6 @@ public FileStatus getFileStatus(Path path) { @Test public void testClosingRowsFirst() throws Exception { - Configuration conf = new Configuration(); MockFileSystem fs = new MockFileSystem(conf); Reader reader = OrcFile.createReader(new Path("/foo"), OrcFile.readerOptions(conf).filesystem(fs)); @@ -329,7 +328,6 @@ public void testClosingRowsFirst() throws Exception { @Test public void testClosingReaderFirst() throws Exception { - Configuration conf = new Configuration(); MockFileSystem fs = new MockFileSystem(conf); Reader reader = OrcFile.createReader(new Path("/foo"), OrcFile.readerOptions(conf).filesystem(fs)); @@ -344,7 +342,6 @@ public void testClosingReaderFirst() throws Exception { @Test public void testClosingMultiple() throws Exception { - Configuration conf = new Configuration(); MockFileSystem fs = 
new MockFileSystem(conf); Reader reader = OrcFile.createReader(new Path("/foo"), OrcFile.readerOptions(conf).filesystem(fs)); @@ -359,7 +356,6 @@ public void testClosingMultiple() throws Exception { @Test public void testOrcTailStripeStats() throws Exception { - Configuration conf = new Configuration(); Path path = new Path(workDir, "orc_split_elim_new.orc"); FileSystem fs = path.getFileSystem(conf); try (ReaderImpl reader = (ReaderImpl) OrcFile.createReader(path, @@ -398,7 +394,6 @@ public void testOrcTailStripeStats() throws Exception { @Test public void testGetRawDataSizeFromColIndices() throws Exception { - Configuration conf = new Configuration(); Path path = new Path(workDir, "orc_split_elim_new.orc"); FileSystem fs = path.getFileSystem(conf); try (ReaderImpl reader = (ReaderImpl) OrcFile.createReader(path, @@ -420,7 +415,6 @@ public void testGetRawDataSizeFromColIndices() throws Exception { private void CheckFileWithSargs(String fileName, String softwareVersion) throws IOException { - Configuration conf = new Configuration(); Path path = new Path(workDir, fileName); FileSystem fs = path.getFileSystem(conf); try (ReaderImpl reader = (ReaderImpl) OrcFile.createReader(path, @@ -450,7 +444,6 @@ public void testSkipBadBloomFilters() throws IOException { @Test public void testReadDecimalV2File() throws IOException { - Configuration conf = new Configuration(); Path path = new Path(workDir, "decimal64_v2_cplusplus.orc"); FileSystem fs = path.getFileSystem(conf); try (ReaderImpl reader = (ReaderImpl) OrcFile.createReader(path, @@ -489,7 +482,6 @@ public void testReadDecimalV2File() throws IOException { @Test public void testExtractFileTailIndexOutOfBoundsException() throws Exception { - Configuration conf = new Configuration(); Path path = new Path(workDir, "demo-11-none.orc"); FileSystem fs = path.getFileSystem(conf); FileStatus fileStatus = fs.getFileStatus(path); @@ -508,7 +500,6 @@ public void testExtractFileTailIndexOutOfBoundsException() throws Exception { @Test public void testWithoutCompressionBlockSize() throws IOException { - Configuration conf = new Configuration(); Path path = new Path(workDir, "TestOrcFile.testWithoutCompressionBlockSize.orc"); FileSystem fs = path.getFileSystem(conf); try (ReaderImpl reader = (ReaderImpl) OrcFile.createReader(path, @@ -530,7 +521,6 @@ public void testWithoutCompressionBlockSize() throws IOException { @Test public void testSargSkipPickupGroupWithoutIndex() throws IOException { - Configuration conf = new Configuration(); // We use ORC files in two languages to test, the previous Java version could not work // well when orc.row.index.stride > 0 and orc.create.index=false, now it can skip these row groups. 
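
The sarg-handling tests in this file (CheckFileWithSargs, testSargSkipPickupGroupWithoutIndex) rely on predicate push-down through Reader.Options. For orientation, here is a minimal, hypothetical push-down read; the file path and the column name `x` are made up, and only SearchArgument/Reader.Options calls from ORC's public API are used.

```java
// Hypothetical sketch of a predicate push-down read; path and column are examples.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

public class SargReadSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    try (Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
        OrcFile.readerOptions(conf))) {
      // Keep only rows where x < 100; the leaf references column "x".
      SearchArgument sarg = SearchArgumentFactory.newBuilder()
          .startAnd()
          .lessThan("x", PredicateLeaf.Type.LONG, 100L)
          .end()
          .build();
      Reader.Options options = reader.options()
          .searchArgument(sarg, new String[]{"x"});
      VectorizedRowBatch batch = reader.getSchema().createRowBatch();
      try (RecordReader rows = reader.rows(options)) {
        while (rows.nextBatch(batch)) {
          // Row groups whose statistics rule out x < 100 are skipped before
          // they are decoded, which is what the sarg tests assert on.
        }
      }
    }
  }
}
```

Whether a row group is actually skipped is decided from its column statistics and, when present, bloom filters, which is why the TestPredicatePushDownBounds cases below build both.
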
Path[] paths = new Path[] { diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java index f0124715b8..f785e6e58e 100644 --- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java +++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java @@ -28,6 +28,7 @@ import org.apache.hadoop.hive.common.io.DiskRangeList; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; @@ -48,6 +49,7 @@ import org.apache.orc.OrcProto; import org.apache.orc.Reader; import org.apache.orc.RecordReader; +import org.apache.orc.TestConf; import org.apache.orc.TestVectorOrcFile; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; @@ -102,7 +104,7 @@ import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; -public class TestRecordReaderImpl { +public class TestRecordReaderImpl implements TestConf { // This is a work around until we update storage-api to allow ChronoLocalDate in // predicates. @@ -113,7 +115,6 @@ static Date toDate(ChronoLocalDate date) { @Test public void testFindColumn() throws Exception { - Configuration conf = new Configuration(); TypeDescription file = TypeDescription.fromString("struct"); TypeDescription reader = TypeDescription.fromString("struct"); SchemaEvolution evo = new SchemaEvolution(file, reader, new Reader.Options(conf)); @@ -126,7 +127,6 @@ public void testFindColumn() throws Exception { @Test public void testFindColumnCaseInsensitively() throws Exception { - Configuration conf = new Configuration(); TypeDescription file = TypeDescription.fromString("struct"); TypeDescription reader = TypeDescription.fromString("struct"); conf.setBoolean("orc.schema.evolution.case.sensitive", false); @@ -136,8 +136,6 @@ public void testFindColumnCaseInsensitively() throws Exception { @Test public void testForcePositionalEvolution() throws Exception { - Configuration conf = new Configuration(); - Path oldFilePath = new Path(TestVectorOrcFile.getFileFromClasspath("orc-file-11-format.orc")); Reader reader = OrcFile.createReader(oldFilePath, OrcFile.readerOptions(conf).filesystem(FileSystem.getLocal(conf))); @@ -263,7 +261,6 @@ public boolean seekToNewSource(long position) throws IOException { @Test public void testMaxLengthToReader() throws Exception { - Configuration conf = new Configuration(); OrcProto.Type rowType = OrcProto.Type.newBuilder() .setKind(OrcProto.Type.Kind.STRUCT).build(); OrcProto.Footer footer = OrcProto.Footer.newBuilder() @@ -1975,7 +1972,6 @@ public void testCloseWithException() throws Exception { "target" + File.separator + "test" + File.separator + "tmp")); private void closeMockedRecordReader(DataReader mockedDataReader) throws IOException { - Configuration conf = new Configuration(); Path path = new Path(workDir, "empty.orc"); FileSystem.get(conf).delete(path, true); Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf) @@ -2252,7 +2248,6 @@ static OrcProto.RowIndexEntry createIndexEntry(Long min, Long max) { @Test public void testPickRowGroups() throws Exception { - Configuration conf = new Configuration(); TypeDescription schema = TypeDescription.fromString("struct"); SchemaEvolution evolution 
= new SchemaEvolution(schema, schema, new Reader.Options(conf)); @@ -2301,7 +2296,6 @@ public void testPickRowGroups() throws Exception { @Test public void testPickRowGroupsError() throws Exception { - Configuration conf = new Configuration(); TypeDescription schema = TypeDescription.fromString("struct"); SchemaEvolution evolution = new SchemaEvolution(schema, schema, new Reader.Options(conf)); @@ -2398,7 +2392,6 @@ public void testSkipDataReaderOpen() throws Exception { when(mockedDataReader.clone()).thenReturn(mockedDataReader); doNothing().when(mockedDataReader).close(); - Configuration conf = new Configuration(); Path path = new Path(workDir, "empty.orc"); FileSystem.get(conf).delete(path, true); OrcFile.WriterOptions options = OrcFile.writerOptions(conf).setSchema(TypeDescription.createLong()); @@ -2413,7 +2406,6 @@ public void testSkipDataReaderOpen() throws Exception { @Test public void testCloseAtConstructorException() throws Exception { - Configuration conf = new Configuration(); Path path = new Path(workDir, "oneRow.orc"); FileSystem.get(conf).delete(path, true); @@ -2444,7 +2436,6 @@ public void testCloseAtConstructorException() throws Exception { @Test public void testSargApplier() throws Exception { - Configuration conf = new Configuration(); TypeDescription schema = TypeDescription.createLong(); SearchArgument sarg = SearchArgumentFactory.newBuilder().build(); SchemaEvolution evo = new SchemaEvolution(schema, schema, new Reader.Options(conf)); @@ -2480,7 +2471,6 @@ public void testWithoutStatistics() { @Test public void testStatisticsWithNoWrites() throws Exception { Path testFilePath = new Path(workDir, "rowIndexStrideNegative.orc"); - Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); fs.delete(testFilePath, true); @@ -2536,7 +2526,6 @@ public void testDoubleColumnWithoutDoubleStatistics() throws Exception { Path filePath = new Path(ClassLoader.getSystemResource("orc-file-no-double-statistic.orc") .getPath()); - Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); Reader reader = OrcFile.createReader(filePath, @@ -2626,7 +2615,6 @@ public void testRgEndOffset() throws IOException { } private void testSmallCompressionSizeOrc(int compressionSize) throws IOException { - Configuration conf = new Configuration(); Path path = new Path(workDir, "smallCompressionSize.orc"); FileSystem.get(conf).delete(path, true); @@ -2672,7 +2660,6 @@ private void testSmallCompressionSizeOrc(int compressionSize) throws IOException @Test public void testRowIndexStrideNegativeFilter() throws Exception { Path testFilePath = new Path(workDir, "rowIndexStrideNegative.orc"); - Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); fs.delete(testFilePath, true); @@ -2715,7 +2702,6 @@ public void testRowIndexStrideNegativeFilter() throws Exception { @Test public void testHadoopVectoredIO() throws Exception { - Configuration conf = new Configuration(); Path filePath = new Path(TestVectorOrcFile.getFileFromClasspath("orc-file-11-format.orc")); FileSystem localFileSystem = FileSystem.getLocal(conf); @@ -2732,4 +2718,70 @@ public void testHadoopVectoredIO() throws Exception { verify(spyFSDataInputStream, atLeastOnce()).readVectored(any(), any()); } + + @Test + public void testDecimalIsRepeatingFlag() throws IOException { + FileSystem fs = FileSystem.get(conf); + Path testFilePath = new Path(workDir, "testDecimalIsRepeatingFlag.orc"); + fs.delete(testFilePath, true); + + Configuration decimalConf = new Configuration(conf); + 
decimalConf.set(OrcConf.STRIPE_ROW_COUNT.getAttribute(), "1024"); + decimalConf.set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "1"); + String typeStr = "decimal(20,10)"; + TypeDescription schema = TypeDescription.fromString("struct"); + Writer w = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(decimalConf).setSchema(schema)); + + VectorizedRowBatch b = schema.createRowBatch(); + DecimalColumnVector f1 = (DecimalColumnVector) b.cols[0]; + for (int i = 0; i < 1024; i++) { + f1.set(i, HiveDecimal.create("-119.4594594595")); + } + b.size = 1024; + w.addRowBatch(b); + + b.reset(); + for (int i = 0; i < 1024; i++) { + f1.set(i, HiveDecimal.create("9318.4351351351")); + } + b.size = 1024; + w.addRowBatch(b); + + b.reset(); + for (int i = 0; i < 1024; i++) { + f1.set(i, HiveDecimal.create("-4298.1513513514")); + } + b.size = 1024; + w.addRowBatch(b); + + b.reset(); + w.close(); + + Reader.Options options = new Reader.Options(); + try (Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf)); + RecordReader rows = reader.rows(options)) { + VectorizedRowBatch batch = schema.createRowBatch(); + + rows.nextBatch(batch); + assertEquals(1024, batch.size); + assertFalse(batch.cols[0].isRepeating); + for (HiveDecimalWritable hiveDecimalWritable : ((DecimalColumnVector) batch.cols[0]).vector) { + assertEquals(HiveDecimal.create("-119.4594594595"), hiveDecimalWritable.getHiveDecimal()); + } + + rows.nextBatch(batch); + assertEquals(1024, batch.size); + assertFalse(batch.cols[0].isRepeating); + for (HiveDecimalWritable hiveDecimalWritable : ((DecimalColumnVector) batch.cols[0]).vector) { + assertEquals(HiveDecimal.create("9318.4351351351"), hiveDecimalWritable.getHiveDecimal()); + } + + rows.nextBatch(batch); + assertEquals(1024, batch.size); + assertFalse(batch.cols[0].isRepeating); + for (HiveDecimalWritable hiveDecimalWritable : ((DecimalColumnVector) batch.cols[0]).vector) { + assertEquals(HiveDecimal.create("-4298.1513513514"), hiveDecimalWritable.getHiveDecimal()); + } + } + } } diff --git a/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java b/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java index 3a82fb5f29..fde63021f9 100644 --- a/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java +++ b/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java @@ -32,6 +32,7 @@ import org.apache.orc.OrcProto; import org.apache.orc.Reader; import org.apache.orc.RecordReader; +import org.apache.orc.TestConf; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.apache.orc.impl.reader.ReaderEncryption; @@ -62,9 +63,8 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestSchemaEvolution { +public class TestSchemaEvolution implements TestConf { - Configuration conf; Reader.Options options; Path testFilePath; FileSystem fs; @@ -73,7 +73,6 @@ public class TestSchemaEvolution { @BeforeEach public void setup(TestInfo testInfo) throws Exception { - conf = new Configuration(); options = new Reader.Options(conf); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestSchemaEvolution." 
+ @@ -2357,6 +2356,9 @@ public void testEvolutionToTimestamp() throws Exception { final ZoneId WRITER_ZONE = ZoneId.of("America/New_York"); final ZoneId READER_ZONE = ZoneId.of("Australia/Sydney"); + final String EXPECT_LOCAL = "expected %s in local time zone"; + final String EXPECT_UTC = "expected %s in UTC time zone"; + final TimeZone oldDefault = TimeZone.getDefault(); final ZoneId UTC = ZoneId.of("UTC"); @@ -2421,50 +2423,62 @@ public void testEvolutionToTimestamp() throws Exception { assertEquals(expected1.replace(".1 ", " "), timestampToString(l1.time[current], l1.nanos[current], READER_ZONE), msg); + assertFalse(l1.isUTC(), EXPECT_LOCAL.formatted("l1")); assertEquals(expected2.replace(".1 ", " "), timestampToString(l2.time[current], l2.nanos[current], WRITER_ZONE), msg); + assertTrue(l2.isUTC(), EXPECT_UTC.formatted("l2")); assertEquals(longTimestampToString(((r % 128) - offset), READER_ZONE), timestampToString(t1.time[current], t1.nanos[current], READER_ZONE), msg); + assertFalse(t1.isUTC(), EXPECT_LOCAL.formatted("t1")); assertEquals(longTimestampToString((r % 128), WRITER_ZONE), timestampToString(t2.time[current], t2.nanos[current], WRITER_ZONE), msg); + assertTrue(t2.isUTC(), EXPECT_UTC.formatted("t2")); assertEquals(expected1, timestampToString(d1.time[current], d1.nanos[current], READER_ZONE), msg); + assertFalse(d1.isUTC(), EXPECT_LOCAL.formatted("d1")); assertEquals(expected2, timestampToString(d2.time[current], d2.nanos[current], WRITER_ZONE), msg); + assertTrue(d2.isUTC(), EXPECT_UTC.formatted("d2")); assertEquals(expected1, timestampToString(dbl1.time[current], dbl1.nanos[current], READER_ZONE), msg); + assertFalse(dbl1.isUTC(), EXPECT_LOCAL.formatted("dbl1")); assertEquals(expected2, timestampToString(dbl2.time[current], dbl2.nanos[current], WRITER_ZONE), msg); + assertTrue(dbl2.isUTC(), EXPECT_UTC.formatted("dbl2")); assertEquals(expectedDate1, timestampToString(dt1.time[current], dt1.nanos[current], READER_ZONE), msg); + assertFalse(dt1.isUTC(), EXPECT_LOCAL.formatted("dt1")); assertEquals(expectedDate2, timestampToString(dt2.time[current], dt2.nanos[current], UTC), msg); + assertTrue(dt2.isUTC(), EXPECT_UTC.formatted("dt2")); assertEquals(expected1, timestampToString(s1.time[current], s1.nanos[current], READER_ZONE), msg); + assertFalse(s1.isUTC(), EXPECT_LOCAL.formatted("s1")); assertEquals(expected2, timestampToString(s2.time[current], s2.nanos[current], WRITER_ZONE), msg); + assertTrue(s2.isUTC(), EXPECT_UTC.formatted("s2")); current += 1; } assertFalse(rows.nextBatch(batch)); @@ -2489,42 +2503,52 @@ public void testEvolutionToTimestamp() throws Exception { assertEquals(expected1.replace(".1 ", " "), timestampToString(l1.time[current], l1.nanos[current], UTC), msg); + assertTrue(l1.isUTC(), EXPECT_UTC.formatted("l1")); assertEquals(expected2.replace(".1 ", " "), timestampToString(l2.time[current], l2.nanos[current], WRITER_ZONE), msg); + assertTrue(l2.isUTC(), EXPECT_UTC.formatted("l2")); assertEquals(expected1, timestampToString(d1.time[current], d1.nanos[current], UTC), msg); + assertTrue(d1.isUTC(), EXPECT_UTC.formatted("d1")); assertEquals(expected2, timestampToString(d2.time[current], d2.nanos[current], WRITER_ZONE), msg); + assertTrue(d2.isUTC(), EXPECT_UTC.formatted("d2")); assertEquals(expected1, timestampToString(dbl1.time[current], dbl1.nanos[current], UTC), msg); + assertTrue(dbl1.isUTC(), EXPECT_UTC.formatted("dbl1")); assertEquals(expected2, timestampToString(dbl2.time[current], dbl2.nanos[current], WRITER_ZONE), msg); + assertTrue(dbl2.isUTC(), 
EXPECT_UTC.formatted("dbl2")); assertEquals(expectedDate, timestampToString(dt1.time[current], dt1.nanos[current], UTC), msg); + assertTrue(dt1.isUTC(), EXPECT_UTC.formatted("dt1")); assertEquals(expectedDate, timestampToString(dt2.time[current], dt2.nanos[current], UTC), msg); + assertTrue(dt2.isUTC(), EXPECT_UTC.formatted("dt2")); assertEquals(expected1, timestampToString(s1.time[current], s1.nanos[current], UTC), msg); + assertTrue(s1.isUTC(), EXPECT_UTC.formatted("s1")); assertEquals(expected2, timestampToString(s2.time[current], s2.nanos[current], WRITER_ZONE), msg); + assertTrue(s2.isUTC(), EXPECT_UTC.formatted("s2")); current += 1; } assertFalse(rows.nextBatch(batch)); diff --git a/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java b/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java index e5d2616cc6..58236502d4 100644 --- a/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java +++ b/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java @@ -19,9 +19,9 @@ package org.apache.orc.impl; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.orc.OrcConf; @@ -30,9 +30,15 @@ import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.apache.orc.*; +import org.apache.orc.geospatial.BoundingBox; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; +import org.locationtech.jts.io.ParseException; +import org.locationtech.jts.io.WKBWriter; +import org.locationtech.jts.io.WKTReader; import java.io.IOException; @@ -40,20 +46,17 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertFalse; -public class TestWriterImpl { +public class TestWriterImpl implements TestConf { Path workDir = new Path(System.getProperty("test.tmp.dir")); - Configuration conf; FileSystem fs; Path testFilePath; TypeDescription schema; @BeforeEach public void openFileSystem() throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); - fs.setWorkingDirectory(workDir); - testFilePath = new Path("testWriterImpl.orc"); + testFilePath = new Path(workDir, "testWriterImpl.orc"); fs.create(testFilePath, true); schema = TypeDescription.fromString("struct"); } @@ -182,6 +185,59 @@ public void testStripeRowCountLimit() throws Exception { assertEquals(10, w.getStripes().size()); } + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testGeospatialColumnStatistics(boolean useFilter) throws IOException, ParseException { + conf.set(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true"); + // Use the Geometry type + schema = TypeDescription.createGeometry(); + Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).setSchema(schema)); + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector geomColumn = (BytesColumnVector) batch.cols[0]; + + WKTReader wktReader = new WKTReader(); + WKBWriter wkbWriter = new WKBWriter(); + byte[] point1 = wkbWriter.write(wktReader.read("POINT (1 2)")); + byte[] point2 = wkbWriter.write(wktReader.read("POINT (3 4)")); + byte[] point3 = wkbWriter.write(wktReader.read("POINT (5 6)")); 
+ byte[] point4 = wkbWriter.write(wktReader.read("POINT (7 8)")); + + geomColumn.setVal(0, point1); + geomColumn.setVal(1, point2); + geomColumn.setVal(2, point3); + geomColumn.setVal(3, point4); + + if (useFilter) { + int[] selected = {2}; + batch.setFilterContext(true, selected, selected.length); + } else { + batch.size = 4; + } + writer.addRowBatch(batch); + writer.close(); + + Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf)); + ColumnStatistics[] statistics = reader.getStatistics(); + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) statistics[0]; + BoundingBox bbox = geometryStatistics.getBoundingBox(); + if (useFilter) { + assertEquals(5.0, bbox.getXMin()); + assertEquals(5.0, bbox.getXMax()); + assertEquals(6.0, bbox.getYMin()); + assertEquals(6.0, bbox.getYMax()); + } else { + assertEquals(1.0, bbox.getXMin()); + assertEquals(7.0, bbox.getXMax()); + assertEquals(2.0, bbox.getYMin()); + assertEquals(8.0, bbox.getYMax()); + } + assertEquals(Double.NaN, bbox.getZMin()); + assertEquals(Double.NaN, bbox.getZMax()); + assertEquals(Double.NaN, bbox.getMMin()); + assertEquals(Double.NaN, bbox.getMMax()); + reader.close(); + } + @Test public void testCloseIsIdempotent() throws IOException { conf.set(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true"); diff --git a/java/core/src/test/org/apache/orc/impl/TestZlib.java b/java/core/src/test/org/apache/orc/impl/TestZlib.java index 4ca62ca2af..f9d5936bc1 100644 --- a/java/core/src/test/org/apache/orc/impl/TestZlib.java +++ b/java/core/src/test/org/apache/orc/impl/TestZlib.java @@ -18,16 +18,24 @@ package org.apache.orc.impl; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.orc.CompressionCodec; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.RecordReader; +import org.apache.orc.TestConf; import org.junit.jupiter.api.Test; import java.io.IOException; import java.nio.ByteBuffer; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; -public class TestZlib { +public class TestZlib implements TestConf { @Test public void testNoOverflow() throws Exception { @@ -54,4 +62,23 @@ public void testCorrupt() throws Exception { // EXPECTED } } + + @Test + public void testCorruptZlibFile() { + Path testFilePath = new Path(ClassLoader. 
+ getSystemResource("orc_corrupt_zlib.orc").getPath()); + + IOException exception = assertThrows( + IOException.class, + () -> { + try (Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf))) { + RecordReader rows = reader.rows(); + VectorizedRowBatch batch = reader.getSchema().createRowBatch(); + while (rows.nextBatch(batch)) { + } + } + } + ); + assertTrue(exception.getMessage().contains("Decompress output buffer too small")); + } } diff --git a/java/core/src/test/org/apache/orc/impl/filter/TestPluginFilterService.java b/java/core/src/test/org/apache/orc/impl/filter/TestPluginFilterService.java index 923910ded1..861cafa0e3 100644 --- a/java/core/src/test/org/apache/orc/impl/filter/TestPluginFilterService.java +++ b/java/core/src/test/org/apache/orc/impl/filter/TestPluginFilterService.java @@ -20,6 +20,8 @@ import org.apache.hadoop.conf.Configuration; import org.apache.orc.filter.BatchFilter; +import org.apache.orc.TestConf; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import java.lang.reflect.Method; @@ -31,11 +33,9 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.assertEquals; -public class TestPluginFilterService { - private final Configuration conf; - - public TestPluginFilterService() { - conf = new Configuration(); +public class TestPluginFilterService implements TestConf { + @BeforeEach + public void addFilter() { conf.set("my.filter.col.name", "f2"); conf.set("my.filter.col.value", "aBcd"); conf.set("my.filter.scope", "file://db/table1/.*"); diff --git a/java/core/src/test/org/apache/orc/impl/filter/TestPluginFilters.java b/java/core/src/test/org/apache/orc/impl/filter/TestPluginFilters.java index 85ec869dba..d08416de35 100644 --- a/java/core/src/test/org/apache/orc/impl/filter/TestPluginFilters.java +++ b/java/core/src/test/org/apache/orc/impl/filter/TestPluginFilters.java @@ -18,13 +18,13 @@ package org.apache.orc.impl.filter; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory; import org.apache.orc.OrcConf; import org.apache.orc.OrcFile; import org.apache.orc.Reader; +import org.apache.orc.TestConf; import org.apache.orc.filter.BatchFilter; import org.junit.jupiter.api.Test; @@ -32,7 +32,7 @@ import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestPluginFilters extends ATestFilter { +public class TestPluginFilters extends ATestFilter implements TestConf { @Test public void testPluginFilterWithSArg() { @@ -40,7 +40,6 @@ public void testPluginFilterWithSArg() { new String[] {"a", "B", "c", "dE", "e", "f"}); // Define the plugin filter - Configuration conf = new Configuration(); OrcConf.ALLOW_PLUGIN_FILTER.setBoolean(conf, true); conf.set("my.filter.name", "my_str_i_eq"); conf.set("my.filter.col.name", "f2"); @@ -75,7 +74,6 @@ public void testPluginSelectsNone() { new String[] {"a", "B", "c", "dE", "e", "f"}); // Define the plugin filter - Configuration conf = new Configuration(); OrcConf.ALLOW_PLUGIN_FILTER.setBoolean(conf, true); conf.set("my.filter.name", "my_str_i_eq"); conf.set("my.filter.col.name", "f2"); @@ -109,7 +107,6 @@ public void testPluginDisabled() { new String[] {"a", "B", "c", "dE", "e", "f"}); // Define the plugin filter - Configuration conf = new Configuration(); 
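The per-test `Configuration conf = new Configuration()` lines being removed throughout these test classes are replaced by the shared org.apache.orc.TestConf interface that the classes now implement. TestConf itself is not shown in these hunks; a minimal sketch of such a shared-configuration mixin, assuming it only needs to expose a Configuration field named conf (the name the tests reference), could look like:

    package org.apache.orc;

    import org.apache.hadoop.conf.Configuration;

    /** Hypothetical sketch only: one Configuration shared by implementing test classes. */
    public interface TestConf {
      // Interface fields are implicitly public static final, so every test class
      // that implements TestConf sees the same mutable Configuration instance "conf".
      Configuration conf = new Configuration();
    }

The actual interface in the patch may initialize conf differently; this sketch only illustrates why the local conf declarations become redundant.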
OrcConf.ALLOW_PLUGIN_FILTER.setBoolean(conf, false); conf.set("my.filter.name", "my_str_i_eq"); conf.set("my.filter.col.name", "f2"); @@ -143,7 +140,6 @@ public void testPluginNonMatchingPath() { new String[] {"a", "B", "c", "dE", "e", "f"}); // Define the plugin filter - Configuration conf = new Configuration(); OrcConf.ALLOW_PLUGIN_FILTER.setBoolean(conf, true); conf.set("my.filter.name", "my_str_i_eq"); conf.set("my.filter.col.name", "f2"); @@ -177,7 +173,6 @@ public void testPluginSelectsAll() { new String[] {"abcdef", "Abcdef", "aBcdef", null, "abcDef", "abcdEf"}); // Define the plugin filter - Configuration conf = new Configuration(); OrcConf.ALLOW_PLUGIN_FILTER.setBoolean(conf, true); conf.set("my.filter.name", "my_str_i_eq"); conf.set("my.filter.col.name", "f2"); @@ -211,7 +206,6 @@ public void testPluginSameColumn() { new String[] {"abcdef", "Abcdef", "aBcdef", null, "abcDef", "abcdEf"}); // Define the plugin filter - Configuration conf = new Configuration(); OrcConf.ALLOW_PLUGIN_FILTER.setBoolean(conf, true); conf.set("my.filter.name", "my_str_i_eq"); conf.set("my.filter.col.name", "f2"); diff --git a/java/core/src/test/org/apache/orc/util/TestStreamWrapperFileSystem.java b/java/core/src/test/org/apache/orc/util/TestStreamWrapperFileSystem.java index f53b8f415a..f04ccec8d7 100644 --- a/java/core/src/test/org/apache/orc/util/TestStreamWrapperFileSystem.java +++ b/java/core/src/test/org/apache/orc/util/TestStreamWrapperFileSystem.java @@ -18,7 +18,6 @@ package org.apache.orc.util; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -27,6 +26,7 @@ import org.apache.orc.OrcFile; import org.apache.orc.Reader; import org.apache.orc.RecordReader; +import org.apache.orc.TestConf; import org.apache.orc.TestVectorOrcFile; import org.apache.orc.TypeDescription; import org.junit.jupiter.api.Test; @@ -40,11 +40,10 @@ /** * Tests for StreamWrapperFileSystem. */ -public class TestStreamWrapperFileSystem { +public class TestStreamWrapperFileSystem implements TestConf { @Test public void testWrapper() throws IOException { - Configuration conf = new Configuration(); Path realFilename = new Path(TestVectorOrcFile.getFileFromClasspath( "orc-file-11-format.orc")); FileSystem local = FileSystem.getLocal(conf); diff --git a/java/core/src/test/resources/log4j.properties b/java/core/src/test/resources/log4j.properties deleted file mode 100644 index 3979ce0787..0000000000 --- a/java/core/src/test/resources/log4j.properties +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -log4j.rootLogger=WARN,stdout - -log4j.appender.stdout=org.apache.log4j.ConsoleAppender -log4j.appender.stdout.layout=org.apache.log4j.PatternLayout -log4j.appender.stdout.layout.ConversionPattern=%p\t%d{ISO8601}\t%r\t%c\t[%t]\t%m%n - -# Suppress the warnings about native io not being available -log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR diff --git a/java/core/src/test/resources/orc_corrupt_zlib.orc b/java/core/src/test/resources/orc_corrupt_zlib.orc new file mode 100644 index 0000000000..e083a07c84 Binary files /dev/null and b/java/core/src/test/resources/orc_corrupt_zlib.orc differ diff --git a/java/examples/pom.xml b/java/examples/pom.xml index 119e00b0d4..a36cd7a409 100644 --- a/java/examples/pom.xml +++ b/java/examples/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc - 2.1.0-SNAPSHOT + 2.3.0-SNAPSHOT ../pom.xml diff --git a/java/mapreduce/pom.xml b/java/mapreduce/pom.xml index 30bd83e5ba..2d803351ac 100644 --- a/java/mapreduce/pom.xml +++ b/java/mapreduce/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc - 2.1.0-SNAPSHOT + 2.3.0-SNAPSHOT ../pom.xml @@ -39,10 +39,6 @@ com.esotericsoftware kryo-shaded - - org.apache.commons - commons-lang3 - com.google.guava guava diff --git a/java/mapreduce/src/java/org/apache/orc/mapred/OrcInputFormat.java b/java/mapreduce/src/java/org/apache/orc/mapred/OrcInputFormat.java index ddb6a6ac2a..947d9b6f80 100644 --- a/java/mapreduce/src/java/org/apache/orc/mapred/OrcInputFormat.java +++ b/java/mapreduce/src/java/org/apache/orc/mapred/OrcInputFormat.java @@ -21,7 +21,6 @@ import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; @@ -68,7 +67,7 @@ public static boolean[] parseInclude(TypeDescription schema, boolean[] result = new boolean[schema.getMaximumId() + 1]; result[0] = true; - if (StringUtils.isBlank(columnsStr)) { + if (columnsStr.isBlank()) { return result; } diff --git a/java/mapreduce/src/test/resources/log4j.properties b/java/mapreduce/src/test/resources/log4j.properties deleted file mode 100644 index 3979ce0787..0000000000 --- a/java/mapreduce/src/test/resources/log4j.properties +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
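One behavioral nuance in the OrcInputFormat change above: org.apache.commons.lang3.StringUtils.isBlank(null) returns true, whereas calling String#isBlank() on a null columnsStr throws a NullPointerException, so the simplification presumably relies on parseInclude never receiving a null column string. If that guarantee does not hold, a null-tolerant equivalent would be:

    if (columnsStr == null || columnsStr.isBlank()) {
      return result;
    }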
- -log4j.rootLogger=WARN,stdout - -log4j.appender.stdout=org.apache.log4j.ConsoleAppender -log4j.appender.stdout.layout=org.apache.log4j.PatternLayout -log4j.appender.stdout.layout.ConversionPattern=%p\t%d{ISO8601}\t%r\t%c\t[%t]\t%m%n - -# Suppress the warnings about native io not being available -log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR diff --git a/java/pom.xml b/java/pom.xml index 578e69a53d..5a88d5877f 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -17,11 +17,11 @@ org.apache apache - 27 + 34 org.apache.orc orc - 2.1.0-SNAPSHOT + 2.3.0-SNAPSHOT pom Apache ORC @@ -60,31 +60,32 @@ - 1.78 - 1.16.0 - 10.15.0 + 1.80 + 1.18.0 + 10.26.1 ${project.basedir}/../../examples - 3.4.0 + 3.4.1 17 ${project.basedir}/../target/javadoc - 5.10.2 + 1.20.0 + 5.13.1 3.7.1 - 3.6.1 - 3.5.2 - 17 + 3.8.1 + 3.6.0 + ${java.version} false - 3.9.6 + 3.9.10 5.10.0 - 1.0.0 + 1.1.0 - 2024-01-08T16:47:56Z - 3.25.3 - 2.0.12 + 2025-07-08T15:22:41Z + 3.25.8 + 2.0.17 2.8.1 - 3.0.0-M5 + 3.5.3 ${project.build.directory}/testing-tmp - 1.5.6-2 + 1.5.7-4 @@ -98,7 +99,7 @@ org.apache.orc orc-shims - 2.1.0-SNAPSHOT + 2.3.0-SNAPSHOT org.apache.hadoop @@ -113,17 +114,17 @@ org.apache.orc orc-core - 2.1.0-SNAPSHOT + 2.3.0-SNAPSHOT org.apache.orc orc-mapreduce - 2.1.0-SNAPSHOT + 2.3.0-SNAPSHOT org.apache.orc orc-tools - 2.1.0-SNAPSHOT + 2.3.0-SNAPSHOT @@ -135,7 +136,7 @@ com.google.code.gson gson - 2.9.0 + 2.13.0 com.google.protobuf @@ -145,28 +146,24 @@ commons-cli commons-cli - 1.6.0 + 1.9.0 org.apache.commons commons-lang3 - 3.14.0 + 3.18.0 + test io.airlift aircompressor - 0.26 + 2.0.2 com.github.luben zstd-jni ${zstd-jni.version} - - org.apache.commons - commons-csv - 1.10.0 - org.apache.hadoop hadoop-client-api @@ -192,7 +189,8 @@ org.jetbrains annotations - 17.0.0 + 26.0.2 + provided org.slf4j @@ -208,7 +206,7 @@ org.threeten threeten-extra - 1.7.1 + 1.8.0 com.aayushatharva.brotli4j @@ -221,7 +219,7 @@ com.google.guava guava - 33.1.0-jre + 33.4.0-jre test @@ -251,19 +249,19 @@ org.objenesis objenesis - 3.2 + 3.3 test net.bytebuddy byte-buddy - 1.14.11 + 1.17.5 test net.bytebuddy byte-buddy-agent - 1.14.11 + 1.17.5 test @@ -340,7 +338,7 @@ com.diffplug.spotless spotless-maven-plugin - 2.43.0 + 2.44.4 @@ -375,7 +373,7 @@ org.apache.maven.plugins maven-jar-plugin - 3.3.0 + 3.4.2 @@ -395,7 +393,7 @@ com.github.spotbugs spotbugs-maven-plugin - 4.8.3.0 + 4.9.3.0 spotbugs-include.xml spotbugs-exclude.xml @@ -404,7 +402,7 @@ com.github.spotbugs spotbugs - 4.7.3 + 4.9.3 @@ -436,6 +434,7 @@ .idea/** **/*.iml **/dependency-reduced-pom.xml + .mvn/jvm.config @@ -450,7 +449,7 @@ org.apache.maven.plugins maven-checkstyle-plugin - 3.3.1 + 3.6.0 ${basedir}/src/java @@ -495,7 +494,7 @@ org.codehaus.mojo build-helper-maven-plugin - 3.5.0 + 3.6.0 add-source @@ -600,7 +599,7 @@ io.github.zlika reproducible-build-maven-plugin - 0.16 + 0.17 @@ -608,12 +607,12 @@ org.apache.maven.plugins maven-enforcer-plugin - 3.4.0 + 3.6.0 org.codehaus.mojo extra-enforcer-rules - 1.8.0 + 1.10.0 @@ -634,9 +633,6 @@ ${java.version} test provided - - org.threeten:threeten-extra - @@ -718,7 +714,7 @@ false false - -Xmx2048m -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED + -Xmx2048m -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED --enable-native-access=ALL-UNNAMED -XX:+EnableDynamicAgentLoading US/Pacific en_US.UTF-8 @@ -728,6 +724,8 @@ ${test.tmp.dir} ${example.dir} + error 
+ @@ -750,7 +748,7 @@ org.cyclonedx cyclonedx-maven-plugin - 2.7.11 + 2.9.1 diff --git a/java/shims/pom.xml b/java/shims/pom.xml index 5e9231b970..1412164f1e 100644 --- a/java/shims/pom.xml +++ b/java/shims/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc - 2.1.0-SNAPSHOT + 2.3.0-SNAPSHOT ../pom.xml diff --git a/java/shims/src/java/org/apache/orc/impl/HadoopShims.java b/java/shims/src/java/org/apache/orc/impl/HadoopShims.java index 2ae0364f25..f79f353647 100644 --- a/java/shims/src/java/org/apache/orc/impl/HadoopShims.java +++ b/java/shims/src/java/org/apache/orc/impl/HadoopShims.java @@ -20,7 +20,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.util.VersionInfo; import org.apache.orc.EncryptionAlgorithm; import java.io.Closeable; @@ -132,9 +131,9 @@ ByteBuffer readBuffer(int maxLength, */ boolean endVariableLengthBlock(OutputStream output) throws IOException; - default boolean supportVectoredIO() { + default boolean supportVectoredIO(String version) { // HADOOP-18103 is available since Apache Hadoop 3.3.5+ - String[] versionParts = VersionInfo.getVersion().split("[.]"); + String[] versionParts = version.split("[.-]"); int major = Integer.parseInt(versionParts[0]); int minor = Integer.parseInt(versionParts[1]); int patch = Integer.parseInt(versionParts[2]); diff --git a/java/shims/src/test/org/apache/orc/impl/TestHadoopShimsPost3_3_4.java b/java/shims/src/test/org/apache/orc/impl/TestHadoopShimsPost3_3_4.java new file mode 100644 index 0000000000..774dac3c24 --- /dev/null +++ b/java/shims/src/test/org/apache/orc/impl/TestHadoopShimsPost3_3_4.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +public class TestHadoopShimsPost3_3_4 { + + @Test + public void testOlderVersionForSupportVectoredIO() { + assertFalse(new HadoopShimsCurrent().supportVectoredIO("3.3.4")); + } + + @Test + public void testSupportedVersionForSupportVectoredIO() { + assertTrue(new HadoopShimsCurrent().supportVectoredIO("3.3.5")); + } + + @Test + public void testExtendedSemanticVersionForSupportVectoredIO() { + assertTrue(new HadoopShimsCurrent().supportVectoredIO("3.3.6-co-3")); + } +} diff --git a/java/shims/src/test/resources/log4j.properties b/java/shims/src/test/resources/log4j.properties deleted file mode 100644 index 3979ce0787..0000000000 --- a/java/shims/src/test/resources/log4j.properties +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
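The supportVectoredIO hunk above shows only the version parsing; the comparison against the 3.3.5 threshold mentioned in its comment is cut off here. Based on that comment and on the new TestHadoopShimsPost3_3_4 expectations (3.3.4 is false, 3.3.5 and 3.3.6-co-3 are true), the rest of the default method presumably reduces to something like the following sketch (the exact expression in the patch may differ):

    default boolean supportVectoredIO(String version) {
      // HADOOP-18103 (vectored IO) is available since Apache Hadoop 3.3.5+.
      // Splitting on '.' and '-' also tolerates vendor suffixes such as "3.3.6-co-3".
      String[] versionParts = version.split("[.-]");
      int major = Integer.parseInt(versionParts[0]);
      int minor = Integer.parseInt(versionParts[1]);
      int patch = Integer.parseInt(versionParts[2]);
      return major > 3
          || (major == 3 && minor > 3)
          || (major == 3 && minor == 3 && patch >= 5);
    }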
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -log4j.rootLogger=WARN,stdout - -log4j.appender.stdout=org.apache.log4j.ConsoleAppender -log4j.appender.stdout.layout=org.apache.log4j.PatternLayout -log4j.appender.stdout.layout.ConversionPattern=%p\t%d{ISO8601}\t%r\t%c\t[%t]\t%m%n - -# Suppress the warnings about native io not being available -log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR diff --git a/java/spotbugs-exclude.xml b/java/spotbugs-exclude.xml index d5f7fe7d9c..26bec6fe95 100644 --- a/java/spotbugs-exclude.xml +++ b/java/spotbugs-exclude.xml @@ -74,4 +74,10 @@ + + + + + + diff --git a/java/tools/pom.xml b/java/tools/pom.xml index cc7cdd34f6..4b51d701bd 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -17,7 +17,7 @@ org.apache.orc orc - 2.1.0-SNAPSHOT + 2.3.0-SNAPSHOT ../pom.xml @@ -48,7 +48,7 @@ com.opencsv opencsv - 5.9 + 5.10 commons-beanutils @@ -60,10 +60,6 @@ commons-cli commons-cli - - org.apache.commons - commons-lang3 - org.apache.hive hive-storage-api @@ -80,6 +76,13 @@ + + org.apache.orc + orc-core + ${project.version} + tests + test + org.bouncycastle bcpkix-jdk18on diff --git a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java index 2347ac7449..c4d5c29738 100644 --- a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java +++ b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java @@ -216,7 +216,11 @@ public static void main(Configuration conf, String[] args) throws Exception { } public static void main(String[] args) throws Exception { - main(new Configuration(), args); + Configuration conf = new Configuration(); + if (Runtime.version().feature() > 21) { + conf.setIfUnset("fs.file.impl.disable.cache", "true"); + } + main(conf, args); } private static Options createOptions() { diff --git a/java/tools/src/java/org/apache/orc/tools/Driver.java b/java/tools/src/java/org/apache/orc/tools/Driver.java index 0d2778b410..cdf594fe27 100644 --- a/java/tools/src/java/org/apache/orc/tools/Driver.java +++ b/java/tools/src/java/org/apache/orc/tools/Driver.java @@ -77,6 +77,7 @@ static class DriverOptions { } public static void main(String[] args) throws Exception { + System.setProperty("org.slf4j.simpleLogger.log.org.apache.hadoop", "error"); DriverOptions options = new DriverOptions(args); if (options.command == null) { @@ -102,6 +103,9 @@ public static void main(String[] args) throws Exception { System.exit(1); } Configuration conf = new Configuration(); + if (Runtime.version().feature() > 21) { + conf.setIfUnset("fs.file.impl.disable.cache", "true"); + } Properties confSettings = options.genericOptions.getOptionProperties("D"); for(Map.Entry pair: confSettings.entrySet()) { conf.set(pair.getKey().toString(), pair.getValue().toString()); diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java b/java/tools/src/java/org/apache/orc/tools/FileDump.java index c235053106..7e952effb6 100644 --- a/java/tools/src/java/org/apache/orc/tools/FileDump.java +++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java @@ -22,7 +22,6 @@ import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; 
import org.apache.commons.cli.Options; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; @@ -61,7 +60,7 @@ */ public final class FileDump { public static final String UNKNOWN = "UNKNOWN"; - public static final String SEPARATOR = StringUtils.repeat("_", 120) + "\n"; + public static final String SEPARATOR = "_".repeat(120) + "\n"; public static final String RECOVER_READ_SIZE = "orc.recover.read.size"; // only for testing public static final int DEFAULT_BLOCK_SIZE = 256 * 1024 * 1024; public static final String DEFAULT_BACKUP_PATH = System.getProperty("java.io.tmpdir"); @@ -134,13 +133,18 @@ public static void main(Configuration conf, String[] args) throws Exception { boolean prettyPrint = cli.hasOption('p'); JsonFileDump.printJsonMetaData(filesInPath, conf, rowIndexCols, prettyPrint, printTimeZone); } else { - printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath); + boolean printColumnType = cli.hasOption("column-type"); + printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath, + printColumnType); } } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); + if (Runtime.version().feature() > 21) { + conf.setIfUnset("fs.file.impl.disable.cache", "true"); + } main(conf, args); } @@ -268,11 +272,11 @@ public static Collection getAllFilesInPath(final Path path, private static void printMetaData(List files, Configuration conf, List rowIndexCols, boolean printTimeZone, final boolean recover, - final String backupPath) + final String backupPath, final boolean printColumnType) throws IOException { List corruptFiles = new ArrayList<>(); for (String filename : files) { - printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles); + printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles, printColumnType); System.out.println(SEPARATOR); } @@ -294,6 +298,15 @@ private static void printMetaData(List files, Configuration conf, } } + static void printColumnsType(TypeDescription schema) { + int maximumId = schema.getMaximumId(); + for (int c = schema.getId(); c < maximumId + 1; ++c) { + TypeDescription type = schema.findSubtype(c); + System.out.println(" Column " + type.getId() + ": field: " + type.getFullFieldName() + + " type: " + type.toString()); + } + } + static void printTypeAnnotations(TypeDescription type, String prefix) { List attributes = type.getAttributeNames(); if (attributes.size() > 0) { @@ -329,7 +342,7 @@ static void printTypeAnnotations(TypeDescription type, String prefix) { private static void printMetaDataImpl(final String filename, final Configuration conf, List rowIndexCols, final boolean printTimeZone, - final List corruptFiles) throws IOException { + final List corruptFiles, final boolean printColumnType) throws IOException { Path file = new Path(filename); Reader reader = getReader(file, conf, corruptFiles); // if we can create reader then footer is not corrupt and file will readable @@ -351,15 +364,20 @@ private static void printMetaDataImpl(final String filename, ? 
"Proleptic Gregorian" : "Julian/Gregorian")); System.out.println("Type: " + reader.getSchema().toString()); + if (printColumnType) { + System.out.println("Columns type:"); + printColumnsType(reader.getSchema()); + } printTypeAnnotations(reader.getSchema(), "root"); System.out.println("\nStripe Statistics:"); List stripeStats = reader.getStripeStatistics(); for (int n = 0; n < stripeStats.size(); n++) { System.out.println(" Stripe " + (n + 1) + ":"); StripeStatistics ss = stripeStats.get(n); - for (int i = 0; i < ss.getColumnStatistics().length; ++i) { + ColumnStatistics[] columnStatistics = ss.getColumnStatistics(); + for (int i = 0; i < columnStatistics.length; ++i) { System.out.println(" Column " + i + ": " + - ss.getColumnStatistics()[i].toString()); + columnStatistics[i].toString()); } } ColumnStatistics[] stats = reader.getStatistics(); @@ -834,6 +852,11 @@ static Options createOptions() { .desc("specify a backup path to store the corrupted files (default: /tmp)") .hasArg() .build()); + + result.addOption(Option.builder() + .longOpt("column-type") + .desc("Print the column id, name and type of each column") + .build()); return result; } diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java index d6166ea91d..7d893a54c4 100644 --- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java +++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java @@ -49,6 +49,8 @@ import java.io.IOException; import java.io.StringWriter; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; @@ -110,10 +112,11 @@ public static void printJsonMetaData(List files, writer.name("stripeNumber").value(n + 1); StripeStatistics ss = stripeStatistics.get(n); writer.name("columnStatistics").beginArray(); - for (int i = 0; i < ss.getColumnStatistics().length; i++) { + ColumnStatistics[] columnStatistics = ss.getColumnStatistics(); + for (int i = 0; i < columnStatistics.length; i++) { writer.beginObject(); writer.name("columnId").value(i); - writeColumnStatistics(writer, ss.getColumnStatistics()[i]); + writeColumnStatistics(writer, columnStatistics[i]); writer.endObject(); } writer.endArray(); @@ -222,6 +225,17 @@ public static void printJsonMetaData(List files, writer.name("numDeletes").value(acidStats.deletes); writer.name("numUpdates").value(acidStats.updates); } + List keys = reader.getMetadataKeys(); + keys.remove(OrcAcidUtils.ACID_STATS); + if (!keys.isEmpty()) { + writer.name("userMetadata").beginObject(); + for (String key : keys) { + writer.name(key); + ByteBuffer byteBuffer = reader.getMetadataValue(key); + writer.value(String.valueOf(StandardCharsets.UTF_8.decode(byteBuffer))); + } + writer.endObject(); + } writer.name("status").value("OK"); rows.close(); diff --git a/java/tools/src/java/org/apache/orc/tools/PrintData.java b/java/tools/src/java/org/apache/orc/tools/PrintData.java index 37a7209421..6c7c18ba15 100644 --- a/java/tools/src/java/org/apache/orc/tools/PrintData.java +++ b/java/tools/src/java/org/apache/orc/tools/PrintData.java @@ -238,6 +238,7 @@ private static Options getOptions() { .build(); Option linesOpt = Option.builder("n").longOpt("lines") .argName("LINES") + .desc("Sets lines of data to be printed") .hasArg() .build(); diff --git a/java/tools/src/java/org/apache/orc/tools/RowCount.java b/java/tools/src/java/org/apache/orc/tools/RowCount.java index fce0db3f4c..779b90b853 100644 --- 
a/java/tools/src/java/org/apache/orc/tools/RowCount.java +++ b/java/tools/src/java/org/apache/orc/tools/RowCount.java @@ -72,7 +72,11 @@ public static void main(Configuration conf, String[] args) throws Exception { } public static void main(String[] args) throws Exception { - main(new Configuration(), args); + Configuration conf = new Configuration(); + if (Runtime.version().feature() > 21) { + conf.setIfUnset("fs.file.impl.disable.cache", "true"); + } + main(conf, args); } private static Options createOptions() { diff --git a/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java b/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java index fbdb8696de..f7e9bb1054 100644 --- a/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java +++ b/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java @@ -199,7 +199,7 @@ public ConvertTool(Configuration conf, this.csvHeaderLines = getIntOption(opts, 'H', 0); this.csvNullString = opts.getOptionValue('n', ""); this.timestampFormat = opts.getOptionValue("t", DEFAULT_TIMESTAMP_FORMAT); - this.bloomFilterColumns = opts.getOptionValue('b', null); + this.bloomFilterColumns = opts.getOptionValue('b'); this.unionTag = opts.getOptionValue("union-tag", "tag"); this.unionValue = opts.getOptionValue("union-value", "value"); String outFilename = opts.hasOption('o') diff --git a/java/tools/src/java/org/apache/orc/tools/json/JsonSchemaFinder.java b/java/tools/src/java/org/apache/orc/tools/json/JsonSchemaFinder.java index 358eb21a5d..7a07650493 100644 --- a/java/tools/src/java/org/apache/orc/tools/json/JsonSchemaFinder.java +++ b/java/tools/src/java/org/apache/orc/tools/json/JsonSchemaFinder.java @@ -29,7 +29,6 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; -import org.apache.commons.lang3.exception.ExceptionUtils; import org.apache.hadoop.conf.Configuration; import org.apache.orc.TypeDescription; import org.apache.orc.TypeDescriptionPrettyPrint; @@ -40,6 +39,8 @@ import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintStream; +import java.io.PrintWriter; +import java.io.StringWriter; import java.math.BigInteger; import java.nio.charset.StandardCharsets; import java.util.List; @@ -264,6 +265,15 @@ public void addFile(java.io.Reader reader, String filename) { } } + public static String getStackTrace(final Throwable throwable) { + if (throwable == null) { + return ""; + } + final StringWriter sw = new StringWriter(); + throwable.printStackTrace(new PrintWriter(sw, true)); + return sw.toString(); + } + private void printParseExceptionMsg(JsonParseException e, String filename) { System.err.printf( "A JsonParseException was thrown while processing the %dth record of file %s.%n", @@ -282,7 +292,7 @@ private void printParseExceptionMsg(JsonParseException e, String filename) { System.exit(1); } } - System.err.printf("Please check the file.%n%n%s%n", ExceptionUtils.getStackTrace(e)); + System.err.printf("Please check the file.%n%n%s%n", getStackTrace(e)); System.exit(1); } diff --git a/java/tools/src/test/org/apache/orc/impl/TestRLEv2.java b/java/tools/src/test/org/apache/orc/impl/TestRLEv2.java index 2c9b7e5555..69a6656e6b 100644 --- a/java/tools/src/test/org/apache/orc/impl/TestRLEv2.java +++ b/java/tools/src/test/org/apache/orc/impl/TestRLEv2.java @@ -17,7 +17,6 @@ */ package org.apache.orc.impl; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import 
org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; @@ -27,6 +26,7 @@ import org.apache.orc.PhysicalWriter; import org.apache.orc.Reader; import org.apache.orc.RecordReader; +import org.apache.orc.TestConf; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.apache.orc.impl.writer.StreamOptions; @@ -50,16 +50,14 @@ import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestRLEv2 { +public class TestRLEv2 implements TestConf { Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); Path testFilePath; - Configuration conf; FileSystem fs; @BeforeEach public void openFileSystem (TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestRLEv2." + testInfo.getTestMethod().get().getName() + ".orc"); diff --git a/java/tools/src/test/org/apache/orc/tools/TestCheckTool.java b/java/tools/src/test/org/apache/orc/tools/TestCheckTool.java index a524f7a505..9787867061 100644 --- a/java/tools/src/test/org/apache/orc/tools/TestCheckTool.java +++ b/java/tools/src/test/org/apache/orc/tools/TestCheckTool.java @@ -18,13 +18,13 @@ package org.apache.orc.tools; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.orc.OrcFile; +import org.apache.orc.TestConf; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.apache.orc.tools.CheckTool; @@ -32,24 +32,22 @@ import org.junit.jupiter.api.Test; import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.IOException; import java.io.PrintStream; import java.nio.charset.StandardCharsets; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestCheckTool { +public class TestCheckTool implements TestConf { private Path workDir = new Path(System.getProperty("test.tmp.dir")); - private Configuration conf; private FileSystem fs; private Path testFilePath; @BeforeEach public void openFileSystem() throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); - fs.setWorkingDirectory(workDir); - testFilePath = new Path("TestCheckTool.testCheckTool.orc"); + testFilePath = new Path(workDir + File.separator + "TestCheckTool.testCheckTool.orc"); fs.delete(testFilePath, false); createFile(); } diff --git a/java/tools/src/test/org/apache/orc/tools/TestColumnSizes.java b/java/tools/src/test/org/apache/orc/tools/TestColumnSizes.java index b28c2c308e..02a9d2388d 100644 --- a/java/tools/src/test/org/apache/orc/tools/TestColumnSizes.java +++ b/java/tools/src/test/org/apache/orc/tools/TestColumnSizes.java @@ -18,13 +18,13 @@ package org.apache.orc.tools; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.orc.OrcFile; +import org.apache.orc.TestConf; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.apache.orc.tools.ColumnSizes; @@ -32,6 +32,7 @@ import org.junit.jupiter.api.Test; 
import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.PrintStream; import java.nio.charset.StandardCharsets; import java.nio.file.Paths; @@ -40,17 +41,14 @@ import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestColumnSizes { +public class TestColumnSizes implements TestConf { private Path workDir = new Path( Paths.get(System.getProperty("test.tmp.dir"), "orc-test-sizes").toString()); - private Configuration conf; private FileSystem fs; @BeforeEach public void openFileSystem() throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); - fs.setWorkingDirectory(workDir); fs.mkdirs(workDir); fs.deleteOnExit(workDir); } @@ -59,8 +57,8 @@ public void openFileSystem() throws Exception { public void testSizes() throws Exception { TypeDescription schema = TypeDescription.fromString("struct"); Map fileToRowCountMap = new LinkedHashMap<>(); - fileToRowCountMap.put("test-sizes-1.orc", 10000); - fileToRowCountMap.put("test-sizes-2.orc", 20000); + fileToRowCountMap.put(workDir + File.separator + "test-sizes-1.orc", 10000); + fileToRowCountMap.put(workDir + File.separator + "test-sizes-2.orc", 20000); for (Map.Entry fileToRowCount : fileToRowCountMap.entrySet()) { Writer writer = OrcFile.createWriter(new Path(fileToRowCount.getKey()), OrcFile.writerOptions(conf) diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java index c265a7400e..fc4a90c8e2 100644 --- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java +++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java @@ -18,10 +18,10 @@ package org.apache.orc.tools; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; @@ -39,6 +39,7 @@ import org.apache.orc.OrcFile; import org.apache.orc.Reader; import org.apache.orc.StripeStatistics; +import org.apache.orc.TestConf; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.junit.jupiter.api.BeforeEach; @@ -58,6 +59,7 @@ import java.nio.file.Paths; import java.sql.Timestamp; import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -73,19 +75,16 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assumptions.assumeTrue; -public class TestFileDump { +public class TestFileDump implements TestConf { Path workDir = new Path(System.getProperty("test.tmp.dir")); - Configuration conf; FileSystem fs; Path testFilePath; @BeforeEach public void openFileSystem () throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); - fs.setWorkingDirectory(workDir); - testFilePath = new Path("TestFileDump.testDump.orc"); + testFilePath = new Path(workDir + File.separator + "TestFileDump.testDump.orc"); fs.delete(testFilePath, false); } @@ -231,6 +230,7 @@ public static void checkOutput(String expected, TestJsonFileDump.getFileFromClasspath(expected)), StandardCharsets.UTF_8); BufferedReader aStream = Files.newBufferedReader(Paths.get(actual), StandardCharsets.UTF_8); Object expectedLine = 
preprocessLine(eStream.readLine()); + final String[] filenames = {"Structure for", "\"fileName\":"}; while (expectedLine != null) { Object actualLine = preprocessLine(aStream.readLine()); if (expectedLine instanceof Long && actualLine instanceof Long) { @@ -238,7 +238,10 @@ public static void checkOutput(String expected, assertTrue(diff < SIZE_SLOP, "expected: " + expectedLine + ", actual: " + actualLine); } else { - assertEquals(expectedLine, actualLine); + String line = (String)expectedLine; + if (!Arrays.stream(filenames).anyMatch(s -> line.startsWith(s))) { // Ignore file path + assertEquals(expectedLine, actualLine); + } } expectedLine = preprocessLine(eStream.readLine()); } @@ -384,7 +387,6 @@ public void testDataDump() throws Exception { @Test public void testDictionaryThreshold() throws Exception { TypeDescription schema = getMyRecordType(); - Configuration conf = new Configuration(); conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION"); conf.setFloat(OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getAttribute(), 0.49f); Writer writer = OrcFile.createWriter(testFilePath, @@ -752,9 +754,7 @@ public void testRecover() throws Exception { long fileSize = fs.getFileStatus(testFilePath).getLen(); - String testFilePathStr = Path.mergePaths( - workDir, Path.mergePaths(new Path(Path.SEPARATOR), testFilePath)) - .toUri().getPath(); + String testFilePathStr = testFilePath.toUri().getPath(); String copyTestFilePathStr = Path.mergePaths( workDir, Path.mergePaths(new Path(Path.SEPARATOR), @@ -827,6 +827,74 @@ public void testDoubleNaNAndInfinite() throws Exception { assertEquals("{\"x\":12.34}", lines[2]); } + @Test + public void testDumpColumnType() throws Exception { + TypeDescription schema = + TypeDescription.fromString("struct"); + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf) + .fileSystem(fs) + .setSchema(schema)); + + VectorizedRowBatch batch = schema.createRowBatch(); + LongColumnVector a = (LongColumnVector) batch.cols[0]; + LongColumnVector b = (LongColumnVector) batch.cols[1]; + LongColumnVector c = (LongColumnVector) batch.cols[2]; + LongColumnVector d = (LongColumnVector) batch.cols[3]; + LongColumnVector e = (LongColumnVector) batch.cols[4]; + DoubleColumnVector f = (DoubleColumnVector) batch.cols[5]; + DoubleColumnVector g = (DoubleColumnVector) batch.cols[6]; + BytesColumnVector h = (BytesColumnVector) batch.cols[7]; + DateColumnVector i = (DateColumnVector) batch.cols[8]; + TimestampColumnVector j = (TimestampColumnVector) batch.cols[9]; + BytesColumnVector k = (BytesColumnVector) batch.cols[10]; + DecimalColumnVector l = (DecimalColumnVector) batch.cols[11]; + BytesColumnVector m = (BytesColumnVector) batch.cols[12]; + BytesColumnVector n = (BytesColumnVector) batch.cols[13]; + + for (int o = 0; o < VectorizedRowBatch.DEFAULT_SIZE * 2; o++) { + int row = batch.size++; + a.vector[row] = row % 2; + b.vector[row] = row % 128; + c.vector[row] = row; + d.vector[row] = row; + e.vector[row] = row * 10000000L; + f.vector[row] = row * 1.0f; + g.vector[row] = row * 1.0d; + byte[] bytes = String.valueOf(row).getBytes(StandardCharsets.UTF_8); + h.setRef(row, bytes, 0, bytes.length); + i.vector[row] = row; + j.time[row] = row * 1000L; + j.nanos[row] = row; + k.setRef(row, bytes, 0, bytes.length); + l.vector[row] = new HiveDecimalWritable(row); + m.setRef(row, bytes, 0, bytes.length); + bytes = String.valueOf(10000 - row).getBytes(StandardCharsets.UTF_8); + n.setRef(row, bytes, 0, bytes.length); + + if (batch.size == batch.getMaxSize()) { + 
writer.addRowBatch(batch); + batch.reset(); + } + } + writer.close(); + assertEquals(VectorizedRowBatch.DEFAULT_SIZE * 2, writer.getNumberOfRows()); + + PrintStream origOut = System.out; + String outputFilename = "orc-file-dump-column-type.out"; + FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename); + + // replace stdout and run command + System.setOut(new PrintStream(myOut, false, StandardCharsets.UTF_8.toString())); + FileDump.main(new String[]{testFilePath.toString(), "--column-type"}); + System.out.flush(); + System.setOut(origOut); + + checkOutput(outputFilename, workDir + File.separator + outputFilename); + } + private static boolean contentEquals(String filePath, String otherFilePath) throws IOException { try (InputStream is = new BufferedInputStream(new FileInputStream(filePath)); InputStream otherIs = new BufferedInputStream(new FileInputStream(otherFilePath))) { diff --git a/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java b/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java index 225d7c34d0..3f3354e3ef 100644 --- a/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java +++ b/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java @@ -18,7 +18,6 @@ package org.apache.orc.tools; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; @@ -28,6 +27,7 @@ import org.apache.orc.CompressionKind; import org.apache.orc.OrcConf; import org.apache.orc.OrcFile; +import org.apache.orc.TestConf; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.junit.jupiter.api.BeforeEach; @@ -42,7 +42,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; -public class TestJsonFileDump { +public class TestJsonFileDump implements TestConf { public static String getFileFromClasspath(String name) { URL url = ClassLoader.getSystemResource(name); @@ -53,16 +53,13 @@ public static String getFileFromClasspath(String name) { } Path workDir = new Path(System.getProperty("test.tmp.dir")); - Configuration conf; FileSystem fs; Path testFilePath; @BeforeEach public void openFileSystem () throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); - fs.setWorkingDirectory(workDir); - testFilePath = new Path("TestFileDump.testDump.orc"); + testFilePath = new Path(workDir + File.separator + "TestFileDump.testDump.orc"); fs.delete(testFilePath, false); } @@ -117,6 +114,10 @@ public void testJsonDump() throws Exception { writer.addRowBatch(batch); } + writer.addUserMetadata("hive.acid.key.index", + StandardCharsets.UTF_8.encode("1,1,1;2,3,5;")); + writer.addUserMetadata("some.user.property", + StandardCharsets.UTF_8.encode("foo#bar$baz&")); writer.close(); PrintStream origOut = System.out; String outputFilename = "orc-file-dump.json"; diff --git a/java/tools/src/test/org/apache/orc/tools/TestMergeFiles.java b/java/tools/src/test/org/apache/orc/tools/TestMergeFiles.java index bac2ee36c4..3fdfeba0c4 100644 --- a/java/tools/src/test/org/apache/orc/tools/TestMergeFiles.java +++ b/java/tools/src/test/org/apache/orc/tools/TestMergeFiles.java @@ -18,7 +18,6 @@ package org.apache.orc.tools; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; @@ -27,6 +26,7 @@ import org.apache.orc.CompressionKind; import org.apache.orc.OrcFile; import 
org.apache.orc.Reader; +import org.apache.orc.TestConf; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.apache.orc.tools.MergeFiles; @@ -34,6 +34,7 @@ import org.junit.jupiter.api.Test; import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.PrintStream; import java.nio.charset.StandardCharsets; import java.nio.file.Paths; @@ -43,21 +44,18 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestMergeFiles { +public class TestMergeFiles implements TestConf { private Path workDir = new Path( Paths.get(System.getProperty("test.tmp.dir"), "orc-test-merge").toString()); - private Configuration conf; private FileSystem fs; private Path testFilePath; @BeforeEach public void openFileSystem() throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); - fs.setWorkingDirectory(workDir); fs.mkdirs(workDir); fs.deleteOnExit(workDir); - testFilePath = new Path("TestMergeFiles.testMerge.orc"); + testFilePath = new Path(workDir + File.separator + "TestMergeFiles.testMerge.orc"); fs.delete(testFilePath, false); } @@ -65,8 +63,8 @@ public void openFileSystem() throws Exception { public void testMerge() throws Exception { TypeDescription schema = TypeDescription.fromString("struct"); Map fileToRowCountMap = new LinkedHashMap<>(); - fileToRowCountMap.put("test-merge-1.orc", 10000); - fileToRowCountMap.put("test-merge-2.orc", 20000); + fileToRowCountMap.put(workDir + File.separator + "test-merge-1.orc", 10000); + fileToRowCountMap.put(workDir + File.separator + "test-merge-2.orc", 20000); for (Map.Entry fileToRowCount : fileToRowCountMap.entrySet()) { Writer writer = OrcFile.createWriter(new Path(fileToRowCount.getKey()), OrcFile.writerOptions(conf) diff --git a/java/tools/src/test/org/apache/orc/tools/TestRowCount.java b/java/tools/src/test/org/apache/orc/tools/TestRowCount.java index 5cf6ffc489..232a1c2b98 100644 --- a/java/tools/src/test/org/apache/orc/tools/TestRowCount.java +++ b/java/tools/src/test/org/apache/orc/tools/TestRowCount.java @@ -18,12 +18,12 @@ package org.apache.orc.tools; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.orc.OrcFile; +import org.apache.orc.TestConf; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.apache.orc.tools.RowCount; @@ -31,6 +31,7 @@ import org.junit.jupiter.api.Test; import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.PrintStream; import java.nio.charset.StandardCharsets; import java.nio.file.Paths; @@ -39,17 +40,14 @@ import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestRowCount { +public class TestRowCount implements TestConf { private Path workDir = new Path( Paths.get(System.getProperty("test.tmp.dir"), "orc-test-count").toString()); - private Configuration conf; private FileSystem fs; @BeforeEach public void openFileSystem() throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); - fs.setWorkingDirectory(workDir); fs.mkdirs(workDir); fs.deleteOnExit(workDir); } @@ -58,8 +56,8 @@ public void openFileSystem() throws Exception { public void testCount() throws Exception { TypeDescription schema = TypeDescription.fromString("struct"); Map fileToRowCountMap = new LinkedHashMap<>(); - 
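(Note on the recurring pattern in these tool tests: each class now declares `implements TestConf` and drops its own `Configuration conf` field, so `FileSystem.getLocal(conf)` picks up a configuration shared by all tests. The `TestConf` interface itself is not shown in this diff; a minimal sketch of what such a mix-in could look like, where the field name and the use of a plain default `Configuration` are assumptions, is below.)

```java
package org.apache.orc;

import org.apache.hadoop.conf.Configuration;

/**
 * Hypothetical sketch of a shared-configuration mix-in for tests.
 * Interface fields are implicitly public static final, so every test
 * that declares "implements TestConf" can refer to "conf" directly.
 */
public interface TestConf {
  Configuration conf = new Configuration();
}
```

Centralizing the configuration also makes it possible to apply a common test setting to every tool test in one place instead of repeating it in each `@BeforeEach` method.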
fileToRowCountMap.put("test-count-1.orc", 10000); - fileToRowCountMap.put("test-count-2.orc", 20000); + fileToRowCountMap.put(workDir + File.separator + "test-count-1.orc", 10000); + fileToRowCountMap.put(workDir + File.separator + "test-count-2.orc", 20000); for (Map.Entry fileToRowCount : fileToRowCountMap.entrySet()) { Writer writer = OrcFile.createWriter(new Path(fileToRowCount.getKey()), OrcFile.writerOptions(conf) diff --git a/java/tools/src/test/org/apache/orc/tools/TesScanData.java b/java/tools/src/test/org/apache/orc/tools/TestScanData.java similarity index 90% rename from java/tools/src/test/org/apache/orc/tools/TesScanData.java rename to java/tools/src/test/org/apache/orc/tools/TestScanData.java index df73abc900..93c9b77b3a 100644 --- a/java/tools/src/test/org/apache/orc/tools/TesScanData.java +++ b/java/tools/src/test/org/apache/orc/tools/TestScanData.java @@ -18,36 +18,34 @@ package org.apache.orc.tools; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.orc.OrcFile; +import org.apache.orc.TestConf; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.PrintStream; import java.nio.charset.StandardCharsets; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TesScanData { +public class TestScanData implements TestConf { private Path workDir = new Path(System.getProperty("test.tmp.dir")); - private Configuration conf; private FileSystem fs; private Path testFilePath; @BeforeEach public void openFileSystem() throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); - fs.setWorkingDirectory(workDir); - testFilePath = new Path("TesScanData.testScan.orc"); + testFilePath = new Path(workDir + File.separator + "TestScanData.testScan.orc"); fs.delete(testFilePath, false); } @@ -86,6 +84,6 @@ public void testScan() throws Exception { assertTrue(output.contains("{\"category\": \"struct\", \"id\": 0, \"max\": 2, \"fields\": [\n" + "{ \"x\": {\"category\": \"int\", \"id\": 1, \"max\": 1}},\n" + "{ \"y\": {\"category\": \"string\", \"id\": 2, \"max\": 2}}]}")); - assertTrue(output.contains("File: TesScanData.testScan.orc, bad batches: 0, rows: 10000/10000")); + assertTrue(output.contains("TestScanData.testScan.orc, bad batches: 0, rows: 10000/10000")); } } diff --git a/java/tools/src/test/org/apache/orc/tools/convert/TestConvert.java b/java/tools/src/test/org/apache/orc/tools/convert/TestConvert.java index 84ec10137f..f208485596 100644 --- a/java/tools/src/test/org/apache/orc/tools/convert/TestConvert.java +++ b/java/tools/src/test/org/apache/orc/tools/convert/TestConvert.java @@ -19,7 +19,6 @@ package org.apache.orc.tools.convert; import org.apache.commons.cli.ParseException; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -28,12 +27,14 @@ import org.apache.orc.OrcFile; import org.apache.orc.Reader; import org.apache.orc.RecordReader; +import org.apache.orc.TestConf; import org.apache.orc.TypeDescription; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import 
org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import java.io.File; import java.io.IOException; import java.sql.Timestamp; import java.util.TimeZone; @@ -41,21 +42,18 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestConvert { +public class TestConvert implements TestConf { public static final TimeZone DEFAULT_TIME_ZONE = TimeZone.getDefault(); Path workDir = new Path(System.getProperty("test.tmp.dir")); - Configuration conf; FileSystem fs; Path testFilePath; @BeforeEach public void openFileSystem () throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); - fs.setWorkingDirectory(workDir); - testFilePath = new Path("TestConvert.testConvert.orc"); + testFilePath = new Path(workDir + File.separator + "TestConvert.testConvert.orc"); fs.delete(testFilePath, false); } diff --git a/java/tools/src/test/org/apache/orc/tools/convert/TestConvertORC.java b/java/tools/src/test/org/apache/orc/tools/convert/TestConvertORC.java index 37e944891d..0610ad4d92 100644 --- a/java/tools/src/test/org/apache/orc/tools/convert/TestConvertORC.java +++ b/java/tools/src/test/org/apache/orc/tools/convert/TestConvertORC.java @@ -19,7 +19,6 @@ package org.apache.orc.tools.convert; import org.apache.commons.cli.ParseException; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; @@ -28,11 +27,13 @@ import org.apache.orc.CompressionKind; import org.apache.orc.OrcFile; import org.apache.orc.Reader; +import org.apache.orc.TestConf; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import java.io.File; import java.io.IOException; import java.nio.file.Paths; import java.util.HashMap; @@ -42,22 +43,19 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestConvertORC { +public class TestConvertORC implements TestConf { private Path workDir = new Path( Paths.get(System.getProperty("test.tmp.dir"), "orc-test-convert-orc").toString()); - private Configuration conf; private FileSystem fs; private Path testFilePath; @BeforeEach public void openFileSystem () throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); - fs.setWorkingDirectory(workDir); fs.mkdirs(workDir); fs.deleteOnExit(workDir); - testFilePath = new Path("TestConvertORC.testConvertORC.orc"); + testFilePath = new Path(workDir + File.separator + "TestConvertORC.testConvertORC.orc"); fs.delete(testFilePath, false); } @@ -65,11 +63,13 @@ public void openFileSystem () throws Exception { public void testConvertFromORC() throws IOException, ParseException { TypeDescription schema = TypeDescription.fromString("struct"); Map fileToRowCountMap = new LinkedHashMap<>(); - fileToRowCountMap.put("test-convert-1.orc", 10000); - fileToRowCountMap.put("test-convert-2.orc", 20000); + String file1 = workDir + File.separator + "test-convert-1.orc"; + String file2 = workDir + File.separator + "test-convert-2.orc"; + fileToRowCountMap.put(file1, 10000); + fileToRowCountMap.put(file2, 20000); Map fileToCompressMap = new HashMap<>(); - fileToCompressMap.put("test-convert-1.orc", CompressionKind.ZLIB); - fileToCompressMap.put("test-convert-2.orc", CompressionKind.SNAPPY); + fileToCompressMap.put(file1, CompressionKind.ZLIB); + 
fileToCompressMap.put(file2, CompressionKind.SNAPPY); for (Map.Entry fileToRowCount : fileToRowCountMap.entrySet()) { Writer writer = OrcFile.createWriter(new Path(fileToRowCount.getKey()), @@ -95,20 +95,20 @@ public void testConvertFromORC() throws IOException, ParseException { writer.close(); } - try (Reader reader = OrcFile.createReader(new Path("test-convert-1.orc"), OrcFile.readerOptions(conf))) { + try (Reader reader = OrcFile.createReader(new Path(file1), OrcFile.readerOptions(conf))) { assertEquals(schema, reader.getSchema()); assertEquals(CompressionKind.ZLIB, reader.getCompressionKind()); assertEquals(10000, reader.getNumberOfRows()); } - try (Reader reader = OrcFile.createReader(new Path("test-convert-2.orc"), OrcFile.readerOptions(conf))) { + try (Reader reader = OrcFile.createReader(new Path(file2), OrcFile.readerOptions(conf))) { assertEquals(schema, reader.getSchema()); assertEquals(CompressionKind.SNAPPY, reader.getCompressionKind()); assertEquals(20000, reader.getNumberOfRows()); } ConvertTool.main(conf, new String[]{"-o", testFilePath.toString(), - "test-convert-1.orc", "test-convert-2.orc"}); + file1, file2}); assertTrue(fs.exists(testFilePath)); try (Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf))) { diff --git a/java/tools/src/test/resources/log4j.properties b/java/tools/src/test/resources/log4j.properties deleted file mode 100644 index 8feed70c28..0000000000 --- a/java/tools/src/test/resources/log4j.properties +++ /dev/null @@ -1,21 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
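(Context for the `TestJsonFileDump` hunk above and the `userMetadata` block added to `orc-file-dump.json` further below: the test now writes two user-metadata entries and expects them in the JSON dump. A standalone sketch of the same round trip follows; the file path and one-column schema are placeholders, and reading the value back through `Reader.getMetadataValue` is assumed to behave as in current ORC releases.)

```java
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class UserMetadataExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/user-metadata-example.orc");  // placeholder path

    Writer writer = OrcFile.createWriter(path,
        OrcFile.writerOptions(conf)
            .setSchema(TypeDescription.fromString("struct<x:int>")));
    // Keys and values mirror the ones added in TestJsonFileDump.
    writer.addUserMetadata("hive.acid.key.index",
        StandardCharsets.UTF_8.encode("1,1,1;2,3,5;"));
    writer.addUserMetadata("some.user.property",
        StandardCharsets.UTF_8.encode("foo#bar$baz&"));
    writer.close();

    try (Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf))) {
      // Prints "1,1,1;2,3,5;", the value the JSON dump reports under userMetadata.
      System.out.println(StandardCharsets.UTF_8
          .decode(reader.getMetadataValue("hive.acid.key.index")));
    }
  }
}
```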
- -log4j.rootLogger=WARN,stdout - -log4j.appender.stdout=org.apache.log4j.ConsoleAppender -log4j.appender.stdout.Target = System.err -log4j.appender.stdout.layout=org.apache.log4j.PatternLayout -log4j.appender.stdout.layout.ConversionPattern=%p\t%d{ISO8601}\t%r\t%c\t[%t]\t%m%n - -# Suppress the warnings about native io not being available -log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR diff --git a/java/tools/src/test/resources/orc-file-dump-column-type.out b/java/tools/src/test/resources/orc-file-dump-column-type.out new file mode 100644 index 0000000000..73267e7287 --- /dev/null +++ b/java/tools/src/test/resources/orc-file-dump-column-type.out @@ -0,0 +1,121 @@ +Structure for TestFileDump.testDump.orc +File Version: 0.12 with ORC_14 by ORC Java 2.1.0-SNAPSHOT +Rows: 2048 +Compression: ZSTD +Compression size: 262144 +Calendar: Julian/Gregorian +Type: struct +Columns type: + Column 0: field: 0 type: struct + Column 1: field: a type: boolean + Column 2: field: b type: tinyint + Column 3: field: c type: smallint + Column 4: field: d type: int + Column 5: field: e type: bigint + Column 6: field: f type: float + Column 7: field: g type: double + Column 8: field: h type: string + Column 9: field: i type: date + Column 10: field: j type: timestamp + Column 11: field: k type: binary + Column 12: field: l type: decimal(20,5) + Column 13: field: m type: varchar(5) + Column 14: field: n type: char(5) + +Stripe Statistics: + Stripe 1: + Column 0: count: 2048 hasNull: false + Column 1: count: 2048 hasNull: false bytesOnDisk: 7 true: 1024 + Column 2: count: 2048 hasNull: false bytesOnDisk: 152 min: 0 max: 127 sum: 130048 + Column 3: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552 + Column 4: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552 + Column 5: count: 2048 hasNull: false bytesOnDisk: 35 min: 0 max: 10230000000 sum: 10475520000000 + Column 6: count: 2048 hasNull: false bytesOnDisk: 2361 min: 0.0 max: 1023.0 sum: 1047552.0 + Column 7: count: 2048 hasNull: false bytesOnDisk: 973 min: 0.0 max: 1023.0 sum: 1047552.0 + Column 8: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972 + Column 9: count: 2048 hasNull: false bytesOnDisk: 21 min: Hybrid AD 1970-01-01 max: Hybrid AD 1972-10-20 + Column 10: count: 2048 hasNull: false bytesOnDisk: 1626 min: 1969-12-31 16:00:00.0 max: 1969-12-31 16:17:03.000001023 + Column 11: count: 2048 hasNull: false bytesOnDisk: 1404 sum: 5972 + Column 12: count: 2048 hasNull: false bytesOnDisk: 1666 min: 0 max: 1023 sum: 1047552 + Column 13: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972 + Column 14: count: 2048 hasNull: false bytesOnDisk: 1277 min: 10000 max: 9999 sum: 10240 + +File Statistics: + Column 0: count: 2048 hasNull: false + Column 1: count: 2048 hasNull: false bytesOnDisk: 7 true: 1024 + Column 2: count: 2048 hasNull: false bytesOnDisk: 152 min: 0 max: 127 sum: 130048 + Column 3: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552 + Column 4: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552 + Column 5: count: 2048 hasNull: false bytesOnDisk: 35 min: 0 max: 10230000000 sum: 10475520000000 + Column 6: count: 2048 hasNull: false bytesOnDisk: 2361 min: 0.0 max: 1023.0 sum: 1047552.0 + Column 7: count: 2048 hasNull: false bytesOnDisk: 973 min: 0.0 max: 1023.0 sum: 1047552.0 + Column 8: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972 + Column 9: count: 2048 hasNull: false bytesOnDisk: 21 min: Hybrid 
AD 1970-01-01 max: Hybrid AD 1972-10-20 + Column 10: count: 2048 hasNull: false bytesOnDisk: 1626 min: 1969-12-31 16:00:00.0 max: 1969-12-31 16:17:03.000001023 + Column 11: count: 2048 hasNull: false bytesOnDisk: 1404 sum: 5972 + Column 12: count: 2048 hasNull: false bytesOnDisk: 1666 min: 0 max: 1023 sum: 1047552 + Column 13: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972 + Column 14: count: 2048 hasNull: false bytesOnDisk: 1277 min: 10000 max: 9999 sum: 10240 + +Stripes: + Stripe: offset: 3 data: 15540 rows: 2048 tail: 225 index: 464 + Stream: column 0 section ROW_INDEX start: 3 length 12 + Stream: column 1 section ROW_INDEX start: 15 length 24 + Stream: column 2 section ROW_INDEX start: 39 length 28 + Stream: column 3 section ROW_INDEX start: 67 length 28 + Stream: column 4 section ROW_INDEX start: 95 length 28 + Stream: column 5 section ROW_INDEX start: 123 length 35 + Stream: column 6 section ROW_INDEX start: 158 length 45 + Stream: column 7 section ROW_INDEX start: 203 length 45 + Stream: column 8 section ROW_INDEX start: 248 length 30 + Stream: column 9 section ROW_INDEX start: 278 length 24 + Stream: column 10 section ROW_INDEX start: 302 length 35 + Stream: column 11 section ROW_INDEX start: 337 length 24 + Stream: column 12 section ROW_INDEX start: 361 length 39 + Stream: column 13 section ROW_INDEX start: 400 length 30 + Stream: column 14 section ROW_INDEX start: 430 length 37 + Stream: column 1 section DATA start: 467 length 7 + Stream: column 2 section DATA start: 474 length 152 + Stream: column 3 section DATA start: 626 length 21 + Stream: column 4 section DATA start: 647 length 21 + Stream: column 5 section DATA start: 668 length 35 + Stream: column 6 section DATA start: 703 length 2361 + Stream: column 7 section DATA start: 3064 length 973 + Stream: column 8 section DATA start: 4037 length 1575 + Stream: column 8 section LENGTH start: 5612 length 47 + Stream: column 8 section DICTIONARY_DATA start: 5659 length 1366 + Stream: column 9 section DATA start: 7025 length 21 + Stream: column 10 section DATA start: 7046 length 35 + Stream: column 10 section SECONDARY start: 7081 length 1591 + Stream: column 11 section DATA start: 8672 length 1368 + Stream: column 11 section LENGTH start: 10040 length 36 + Stream: column 12 section DATA start: 10076 length 1647 + Stream: column 12 section SECONDARY start: 11723 length 19 + Stream: column 13 section DATA start: 11742 length 1575 + Stream: column 13 section LENGTH start: 13317 length 47 + Stream: column 13 section DICTIONARY_DATA start: 13364 length 1366 + Stream: column 14 section DATA start: 14730 length 753 + Stream: column 14 section LENGTH start: 15483 length 11 + Stream: column 14 section DICTIONARY_DATA start: 15494 length 513 + Encoding column 0: DIRECT + Encoding column 1: DIRECT + Encoding column 2: DIRECT + Encoding column 3: DIRECT_V2 + Encoding column 4: DIRECT_V2 + Encoding column 5: DIRECT_V2 + Encoding column 6: DIRECT + Encoding column 7: DIRECT + Encoding column 8: DICTIONARY_V2[1024] + Encoding column 9: DIRECT_V2 + Encoding column 10: DIRECT_V2 + Encoding column 11: DIRECT_V2 + Encoding column 12: DIRECT_V2 + Encoding column 13: DICTIONARY_V2[1024] + Encoding column 14: DICTIONARY_V2[1024] + +File length: 16919 bytes +File raw data size: 1048404 bytes +Padding length: 0 bytes +Padding ratio: 0% +________________________________________________________________________________________________________________________ + diff --git a/java/tools/src/test/resources/orc-file-dump.json 
b/java/tools/src/test/resources/orc-file-dump.json index d94c59bb6a..15fdba74a8 100644 --- a/java/tools/src/test/resources/orc-file-dump.json +++ b/java/tools/src/test/resources/orc-file-dump.json @@ -2,7 +2,7 @@ "fileName": "TestFileDump.testDump.orc", "fileVersion": "0.12", "writerVersion": "ORC_14", - "softwareVersion": "ORC Java 1.9.0-SNAPSHOT", + "softwareVersion": "ORC Java 2.1.0-SNAPSHOT", "numberOfRows": 21000, "compression": "ZLIB", "compressionBufferSize": 4096, @@ -461,48 +461,48 @@ "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 1, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 2, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 3, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 4, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } ], "stripeLevelBloomFilter": { "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } } ] @@ -704,48 +704,48 @@ "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 1, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 2, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 3, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 4, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } ], "stripeLevelBloomFilter": { "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } } ] @@ -947,48 +947,48 @@ "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 1, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 2, 
"numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 3, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 4, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } ], "stripeLevelBloomFilter": { "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } } ] @@ -1190,48 +1190,48 @@ "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 1, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 2, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 3, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 }, { "entryId": 4, "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } ], "stripeLevelBloomFilter": { "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } } ] @@ -1361,16 +1361,16 @@ "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } ], "stripeLevelBloomFilter": { "numHashFunctions": 7, "bitCount": 9600, "popCount": 238, - "loadFactor": 0.024791667237877846, - "expectedFpp": 5.756256582500896E-12 + "loadFactor": 0.024791667, + "expectedFpp": 5.7562566E-12 } } ] @@ -1380,5 +1380,9 @@ "rawDataSize": 2144730, "paddingLength": 0, "paddingRatio": 0.0, + "userMetadata": { + "hive.acid.key.index": "1,1,1;2,3,5;", + "some.user.property": "foo#bar$baz&" + }, "status": "OK" } diff --git a/meson.build b/meson.build new file mode 100644 index 0000000000..db23c300aa --- /dev/null +++ b/meson.build @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +project( + 'orc', + 'cpp', + version: '2.2.0-SNAPSHOT', + license: 'Apache-2.0', + meson_version: '>=1.3.0', + default_options: [ + 'warning_level=2', + 'cpp_std=c++17', + ], +) + +subdir('c++') + +install_data( + [ + 'LICENSE', + 'NOTICE', + ], + install_dir: 'share/doc/orc', +) + +if get_option('tools').enabled() + subdir('tools') +endif diff --git a/meson.options b/meson.options new file mode 100644 index 0000000000..c56f89e02b --- /dev/null +++ b/meson.options @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +option( + 'tests', + type: 'feature', + value: 'enabled', + description: 'Build the googletest unit tests', +) + +option( + 'tools', + type: 'feature', + value: 'enabled', + description: 'Build the tools', +) diff --git a/site/Dockerfile b/site/Dockerfile index ff0a613974..a2a26a285f 100644 --- a/site/Dockerfile +++ b/site/Dockerfile @@ -17,8 +17,11 @@ # ORC site builder # -FROM ubuntu:20.04 -LABEL maintainer="Apache ORC project " +FROM ubuntu:24.04 +LABEL org.opencontainers.image.authors="Apache ORC project " +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.ref.name="Apache ORC site builder" +LABEL org.opencontainers.image.version="" RUN ln -fs /usr/share/zoneinfo/America/Los_Angeles /etc/localtime RUN apt-get update @@ -37,8 +40,8 @@ RUN gem install \ liquid \ listen \ rouge -RUN gem install jekyll -v 3.8.6 -RUN gem install github-pages +RUN gem install jekyll +RUN gem install -f github-pages RUN useradd -ms /bin/bash orc COPY . /home/orc/site diff --git a/site/Gemfile b/site/Gemfile index 1c529c9ce1..200c6ce7b2 100644 --- a/site/Gemfile +++ b/site/Gemfile @@ -1,3 +1,2 @@ source 'https://rubygems.org/' -gem 'rouge' -gem 'jekyll', "~> 3.8.3" +gem 'jekyll', "~> 4.3" diff --git a/site/README.md b/site/README.md index d77b39d4ab..5de23f9c11 100644 --- a/site/README.md +++ b/site/README.md @@ -9,7 +9,7 @@ the site is to use docker to use a standard environment. 1. `cd site` 2. `git clone git@github.com:apache/orc.git -b asf-site target` -## Run the docker container with the preview of the site. +## Run the docker container with the preview of the site 1. 
`docker run -d --name orc-container -p 4000:4000 -v $PWD:/home/orc/site apache/orc-dev:site` diff --git a/site/_data/releases.yml b/site/_data/releases.yml index e4f0fc3600..d9a7ef804b 100644 --- a/site/_data/releases.yml +++ b/site/_data/releases.yml @@ -1,16 +1,128 @@ # List the releases in reverse logical order # Only one release should be tagged latest +2.1.3: + date: 2025-05-09 + state: latest + tar: orc-2.1.3.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: 75f3a876eb520ec8327c275ee179027427ee77dc8105d773c6a415031b9bd74e41213a8b8f0dafb0a32318b26450002e843d764f8b8e46479e16147675b4eaca + known-issues: + +2.1.2: + date: 2025-05-06 + state: archived + tar: orc-2.1.2.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: 55451e65dea6ed42afb39fe33a88f9dcea8928dca0a0c9c23ef5545587810b4c + known-issues: + +2.1.1: + date: 2025-03-06 + state: archived + tar: orc-2.1.1.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: 15af8baeee322bab0298559a14a09cf8c14cf2008e35d8a78d3cc8a4c98d1e59 + known-issues: + +2.1.0: + date: 2025-01-09 + state: archived + tar: orc-2.1.0.tar.gz + signed-by: William Hyun (DECDFA29) + sha256: 1ffac0228aa83f04a1b1cf2788a3af5953e82587ae3a77c41900e99f2557132d + known-issues: + +2.0.6: + date: 2025-07-07 + state: stable + tar: orc-2.0.6.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: 81167d31d7ec51de3b2acbbdbecbfbff50d7e321aadd9d537f4931cc0c07e045 + known-issues: + +2.0.5: + date: 2025-05-06 + state: archived + tar: orc-2.0.5.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: 35dc3ad801f632f46028c45a31474f2cc03de63d1c8d8124870525b3aa95982a + known-issues: + +2.0.4: + date: 2025-03-20 + state: archived + tar: orc-2.0.4.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: 9525a76fae64a6da2a29adba36474c2ef863d042a394b78a9873d591649b7f0a + known-issues: + +2.0.3: + date: 2024-11-14 + state: archived + tar: orc-2.0.3.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: 082cba862b5a8a0d14c225404d0b51cd8d1b64ca81b8f1e500322ce8922cb86d + known-issues: + +2.0.2: + date: 2024-08-15 + state: archived + tar: orc-2.0.2.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: fabdee3e8acd64dae1e8b8987149a7188121b40b025de46d15cc9d0becee2279 + known-issues: + +2.0.1: + date: 2024-05-14 + state: archived + tar: orc-2.0.1.tar.gz + signed-by: William Hyun (DECDFA29) + sha256: 1ffac0228aa83f04a1b1cf2788a3af5953e82587ae3a77c41900e99f2557132d + known-issues: + 2.0.0: date: 2024-03-08 - state: latest + state: archived tar: orc-2.0.0.tar.gz signed-by: Dongjoon Hyun (34F0FC5C) sha256: 9107730919c29eb39efaff1b9e36166634d1d4d9477e5fee76bfd6a8fec317df known-issues: +1.9.7: + date: 2025-07-04 + state: stable + tar: orc-1.9.7.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: 3b3b18f472f8edf3649051e17f012831a0eb8bc55fef3d8f5733d4911332b059 + known-issues: + +1.9.6: + date: 2025-05-06 + state: archived + tar: orc-1.9.6.tar.gz + signed-by: Gang Wu (8A461DF4) + sha256: 4442944f53b6b4d48f0b6a1938a8f7d1233a92864d7d588201225c8977371754 + known-issues: + +1.9.5: + date: 2024-11-14 + state: archived + tar: orc-1.9.5.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: 6900b4e8a2e4e49275f4067bd0f838ad68330204305fd3f13a5ec519e9d71547 + known-issues: + +1.9.4: + date: 2024-07-16 + state: archived + tar: orc-1.9.4.tar.gz + signed-by: William Hyun (DECDFA29) + sha256: d9a6bcc00e07a6e54d81ce380134e495ed0fc0d9dc1988d4d52125c9def097fd + known-issues: + 1.9.3: date: 2024-03-20 - state: stable + state: archived tar: orc-1.9.3.tar.gz signed-by: Gang Wu (578F619B) sha256: 
f737d005d0c4deb65688ac3c0223ed530b0ba6258552555b2774dcdb77359b0f @@ -40,9 +152,41 @@ sha256: 0dca8bbccdb2ee87e59ba964933436beebd02ea78c4134424828a8127fbc4faa known-issues: +1.8.10: + date: 2025-06-26 + state: stable + tar: orc-1.8.10.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: c204243c55d34d83a1577c347b5450eb58ece9e91f6a0eaab2842d9ed3b1503a + known-issues: + +1.8.9: + date: 2025-05-06 + state: archived + tar: orc-1.8.9.tar.gz + signed-by: Gang Wu (8A461DF4) + sha256: 66343dc6832beda96b118bd78e74b079b20e4fda756d98c498db92d8bfc4c639 + known-issues: + +1.8.8: + date: 2024-11-11 + state: archived + tar: orc-1.8.8.tar.gz + signed-by: Gang Wu (8A461DF4) + sha256: eca12a9139c0889d11ef1ecc8f273ccb0ef5d19df70d61cb732194d806db026b + known-issues: + +1.8.7: + date: 2024-04-14 + state: archived + tar: orc-1.8.7.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: 57c9d12bf74b2752b1ce1039c15035c3b6f6531d865df962a99b3e079b3dfdb7 + known-issues: + 1.8.6: date: 2023-11-10 - state: stable + state: archived tar: orc-1.8.6.tar.gz signed-by: Dongjoon Hyun (34F0FC5C) sha256: 5675b18118df4dd7f86cc6ba859ed75b425ea1b7ddff805e1d671a17fd57d7f7 @@ -96,9 +240,17 @@ sha256: 859d78bfded98405c32ccb2847b565a57bcc53f473a74087c1f750aeb5932e62 known-issues: +1.7.11: + date: 2024-09-13 + state: archived + tar: orc-1.7.11.tar.gz + signed-by: Dongjoon Hyun (34F0FC5C) + sha256: ff62f0b882470529b3e2507daa4092ffdb34818c220abefb11cac443e5757236 + known-issues: + 1.7.10: date: 2023-11-10 - state: stable + state: archived tar: orc-1.7.10.tar.gz signed-by: Dongjoon Hyun (34F0FC5C) sha256: 85aef9368dc9bcdffaaf10010b66dfe053ce22f30b64854f63852248164686a3 diff --git a/site/_docs/building.md b/site/_docs/building.md index f1cc015eaa..d10be485c2 100644 --- a/site/_docs/building.md +++ b/site/_docs/building.md @@ -9,10 +9,11 @@ dockerUrl: https://github.com/apache/orc/blob/main/docker The C++ library is supported on the following operating systems: -* CentOS 7 -* Debian 10 to 12 -* MacOS 12 to 14 -* Ubuntu 20.04 to 24.04 +* MacOS 13 to 15 +* Debian 11 to 12 +* Ubuntu 22.04 to 24.04 +* Oracle Linux 8 to 9 +* Amazon Linux 2023 You'll want to install the usual set of developer tools, but at least: @@ -27,11 +28,11 @@ is in the docker subdirectory, for the list of packages required to build ORC: * [Debian 11]({{ page.dockerUrl }}/debian11/Dockerfile) * [Debian 12]({{ page.dockerUrl }}/debian12/Dockerfile) -* [Ubuntu 20]({{ page.dockerUrl }}/ubuntu20/Dockerfile) * [Ubuntu 22]({{ page.dockerUrl }}/ubuntu22/Dockerfile) * [Ubuntu 24]({{ page.dockerUrl }}/ubuntu24/Dockerfile) -* [Fedora 37]({{ page.dockerUrl }}/fedora37/Dockerfile) +* [Oracle Linux 8]({{ page.dockerUrl }}/oraclelinux8/Dockerfile) * [Oracle Linux 9]({{ page.dockerUrl }}/oraclelinux9/Dockerfile) +* [Amazon Linux 2023]({{ page.dockerUrl }}/amazonlinux23/Dockerfile) To build a normal release: diff --git a/site/_docs/dask.md b/site/_docs/dask.md index 7719e7d4cd..d443bfef9d 100644 --- a/site/_docs/dask.md +++ b/site/_docs/dask.md @@ -9,7 +9,7 @@ permalink: /docs/dask.html [Dask](https://dask.org) also supports Apache ORC. ``` -pip3 install "dask[dataframe]==2023.8.1" +pip3 install "dask[dataframe]==2025.5.1" pip3 install pandas ``` diff --git a/site/_docs/index.md b/site/_docs/index.md index 76addd410f..5d3e2ec2a9 100644 --- a/site/_docs/index.md +++ b/site/_docs/index.md @@ -37,4 +37,4 @@ are separated from each other so the reader can read just the columns that are required. 
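(The `--column-type` flag documented a few lines below in the `java-tools.md` hunk is the option exercised by `TestFileDump#testDumpColumnType` earlier in this diff. A minimal sketch of driving it programmatically, exactly as the test does; the input path here is a placeholder.)

```java
import org.apache.orc.tools.FileDump;

public class ColumnTypeDumpExample {
  public static void main(String[] args) throws Exception {
    // Prints one line per column, e.g. "Column 1: field: a type: boolean",
    // matching the new orc-file-dump-column-type.out resource.
    FileDump.main(new String[]{"/tmp/example.orc", "--column-type"});
  }
}
```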
For details on the specifics of the ORC format, please see the [ORC -format specification](/specification/). \ No newline at end of file +format specification]({{ site.baseurl }}/specification/). diff --git a/site/_docs/java-tools.md b/site/_docs/java-tools.md index f537201133..a3d546e007 100644 --- a/site/_docs/java-tools.md +++ b/site/_docs/java-tools.md @@ -142,6 +142,9 @@ equivalent to the Hive ORC File Dump command. `--backup-path ` : when used with --recover specifies the path where the recovered file is written (default: /tmp) +`--column-type` + : Print the column id, name and type of each column + `-d,--data` : Should the data be printed diff --git a/site/_docs/pyarrow.md b/site/_docs/pyarrow.md index fca23797fe..c082cc7a28 100644 --- a/site/_docs/pyarrow.md +++ b/site/_docs/pyarrow.md @@ -9,7 +9,7 @@ permalink: /docs/pyarrow.html [Apache Arrow](https://arrow.apache.org) project's [PyArrow](https://pypi.org/project/pyarrow/) is the recommended package. ``` -pip3 install pyarrow==13.0.0 +pip3 install pyarrow==20.0.0 pip3 install pandas ``` diff --git a/site/_includes/docs_ul.html b/site/_includes/docs_ul.html index 8e93fee854..a11fdbadb8 100644 --- a/site/_includes/docs_ul.html +++ b/site/_includes/docs_ul.html @@ -12,7 +12,7 @@ {% for p in site.docs %} {% if p.url == item_url %} -
  • {{ p.title }}
  • +
  • {{ p.title }}
  • {% break %} {% endif %} {% endfor %} diff --git a/site/_includes/header.html b/site/_includes/header.html index e6e4721cf9..04d5ebde21 100644 --- a/site/_includes/header.html +++ b/site/_includes/header.html @@ -5,9 +5,9 @@
    diff --git a/site/_includes/news_contents.html b/site/_includes/news_contents.html index 2748456741..85546b49c5 100644 --- a/site/_includes/news_contents.html +++ b/site/_includes/news_contents.html @@ -2,17 +2,17 @@