diff --git a/.asf.yaml b/.asf.yaml
index 14178a61c8..0b09389458 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-# https://cwiki.apache.org/confluence/display/INFRA/git+-+.asf.yaml+features
+# https://github.com/apache/infrastructure-asfyaml/blob/main/README.md
---
github:
description: "Apache ORC - the smallest, fastest columnar storage for Hadoop workloads"
@@ -24,12 +24,17 @@ github:
merge: false
squash: true
rebase: true
+ ghp_branch: main
+ ghp_path: /site
labels:
- apache
- orc
- java
- cpp
- big-data
+ protected_tags:
+ - "rel/*"
+ - "v*.*.*"
notifications:
pullrequests: issues@orc.apache.org
issues: issues@orc.apache.org
diff --git a/.clang-tidy b/.clang-tidy
index bd995bca54..b401f8948b 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -21,13 +21,14 @@ Checks: "-*,
CheckOptions:
[
+ { key: readability-identifier-naming.ParameterCase, value: "camelBack" },
+ { key: readability-identifier-naming.PrivateMemberCase, value: "camelBack"},
{ key: readability-identifier-naming.PrivateMemberSuffix, value: "_" },
{ key: readability-identifier-naming.ProtectedMemberSuffix, value: "" },
{ key: readability-identifier-naming.PublicMemberSuffix, value: "" },
- { key: readability-identifier-naming.ParameterCase, value: "camelBack" },
{ key: readability-identifier-naming.ParameterIgnoredRegexp, value: "^[a-zA-Z]$" },
]
WarningsAsErrors: ''
-HeaderFilterRegex: '.*'
+HeaderFilterRegex: '(orc/c\+\+/|orc/tools)'
FormatStyle: none
\ No newline at end of file
diff --git a/.github/.licenserc.yaml b/.github/.licenserc.yaml
index a66db6601f..a16671e9d6 100644
--- a/.github/.licenserc.yaml
+++ b/.github/.licenserc.yaml
@@ -22,5 +22,6 @@ header:
- 'NOTICE'
- '.clang-format'
- '.asf.yaml'
+ - '.nojekyll'
comment: on-failure
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 8eddbcdea3..05a385618d 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -20,12 +20,9 @@ updates:
schedule:
interval: "weekly"
ignore:
- # Pin gson to 2.2.4 because of Hive
+ # Pin gson to 2.9.0 because of Hive
- dependency-name: "com.google.code.gson:gson"
- versions: "[2.3,)"
+ versions: "[2.9,1)"
# Pin jodd-core to 3.5.2
- dependency-name: "org.jodd:jodd-core"
versions: "[3.5.3,)"
- # Pin annotations to 17.0.0
- - dependency-name: "org.jetbrains.annotations"
- versions: "[17.0.1,)"
diff --git a/.github/lsan-suppressions.txt b/.github/lsan-suppressions.txt
new file mode 100644
index 0000000000..fc26ee8754
--- /dev/null
+++ b/.github/lsan-suppressions.txt
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Add specific leak suppressions here if needed
+# Format:
+# leak:SymbolName
+# leak:source_file.cc
diff --git a/.github/workflows/asan_test.yml b/.github/workflows/asan_test.yml
new file mode 100644
index 0000000000..6e7ac64fbb
--- /dev/null
+++ b/.github/workflows/asan_test.yml
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Address/Undefined Sanitizer Tests
+
+on:
+ pull_request:
+ paths-ignore:
+ - 'site/**'
+ - 'conan/**'
+ branches:
+ - main
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.number || github.sha }}
+ cancel-in-progress: true
+
+jobs:
+ asan-test:
+ name: "ASAN with ${{ matrix.compiler }} on Ubuntu"
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ compiler: [gcc, clang]
+ include:
+ - compiler: gcc
+ cc: gcc
+ cxx: g++
+ - compiler: clang
+ cc: clang
+ cxx: clang++
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y build-essential cmake libpthread-stubs0-dev
+ - name: Configure and Build with ASAN and UBSAN
+ env:
+ CC: ${{ matrix.cc }}
+ CXX: ${{ matrix.cxx }}
+ run: |
+ mkdir -p build && cd build
+ cmake .. -DCMAKE_BUILD_TYPE=Debug -DENABLE_ASAN=ON -DENABLE_UBSAN=ON -DBUILD_ENABLE_AVX512=ON -DBUILD_CPP_ENABLE_METRICS=ON -DBUILD_JAVA=OFF
+ make
+ - name: Run Tests
+ working-directory: build
+ env:
+ ASAN_OPTIONS: detect_leaks=1:symbolize=1:strict_string_checks=1:halt_on_error=0:detect_container_overflow=0
+ LSAN_OPTIONS: suppressions=${{ github.workspace }}/.github/lsan-suppressions.txt
+ UBSAN_OPTIONS: print_stacktrace=1
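+ # halt_on_error=0 lets ASAN report findings without aborting the test
+ # binaries, and detect_container_overflow=0 avoids false positives when
+ # not all linked code is ASAN-instrumented.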
+ run: |
+ ctest --output-on-failure
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index b0350193ba..750dec550c 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
name: Build and test
on:
@@ -30,11 +47,12 @@ jobs:
- debian11
- debian12
- ubuntu24
- - fedora37
+ - oraclelinux8
- oraclelinux9
+ - amazonlinux23
steps:
- name: Checkout
- uses: actions/checkout@v2
+ uses: actions/checkout@v4
- name: "Test"
run: |
cd docker
@@ -47,11 +65,12 @@ jobs:
fail-fast: false
matrix:
os:
- - ubuntu-20.04
- ubuntu-22.04
- - macos-12
+ - ubuntu-24.04
+ - ubuntu-24.04-arm
- macos-13
- macos-14
+ - macos-15
java:
- 17
- 21
@@ -61,71 +80,40 @@ jobs:
- os: ubuntu-22.04
java: 17
cxx: g++
+ - os: ubuntu-latest
+ java: 25-ea
env:
MAVEN_OPTS: -Xmx2g
MAVEN_SKIP_RC: true
steps:
- name: Checkout
- uses: actions/checkout@v2
- - name: Cache Maven local repository
- uses: actions/cache@v2
- with:
- path: ~/.m2/repository
- key: ${{ matrix.java }}-maven-${{ hashFiles('**/pom.xml') }}
- restore-keys: |
- ${{ matrix.java }}-maven-
+ uses: actions/checkout@v4
- name: Install Java ${{ matrix.java }}
- uses: actions/setup-java@v3
+ uses: actions/setup-java@v4
with:
distribution: zulu
java-version: ${{ matrix.java }}
+ cache: 'maven'
- name: "Test"
run: |
mkdir -p ~/.m2
- mkdir build
- cd build
- if [ "${{ matrix.os }}" = "ubuntu-20.04" ]; then
- cmake -DANALYZE_JAVA=ON -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} -DSTOP_BUILD_ON_WARNING=OFF ..
+ if [ "${{ matrix.java }}" = "25-ea" ]; then
+ cd java
+ # JDK 25 Build
+ ./mvnw package -DskipTests
+ # JDK 25 Test: shims, core, tools modules
+ ./mvnw package -pl tools -am
else
+ mkdir build
+ cd build
cmake -DANALYZE_JAVA=ON -DOPENSSL_ROOT_DIR=`brew --prefix openssl@1.1` ..
+ make package test-out
fi
- make package test-out
- name: Step on failure
if: ${{ failure() }}
run: |
cat /home/runner/work/orc/orc/build/java/rat.txt
- windows:
- name: "C++ ${{ matrix.simd }} Test on Windows"
- runs-on: windows-2019
- strategy:
- fail-fast: false
- matrix:
- simd:
- - General
- - AVX512
- env:
- ORC_USER_SIMD_LEVEL: AVX512
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - name: Add msbuild to PATH
- uses: microsoft/setup-msbuild@v1.1
- with:
- msbuild-architecture: x64
- - name: "Test"
- shell: bash
- run: |
- mkdir build
- cd build
- if [ "${{ matrix.simd }}" = "General" ]; then
- cmake .. -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF
- else
- cmake .. -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON
- fi
- cmake --build . --config Debug
- ctest -C Debug --output-on-failure
-
simdUbuntu:
name: "SIMD programming using C++ intrinsic functions on ${{ matrix.os }}"
runs-on: ${{ matrix.os }}
@@ -140,7 +128,7 @@ jobs:
ORC_USER_SIMD_LEVEL: AVX512
steps:
- name: Checkout
- uses: actions/checkout@v2
+ uses: actions/checkout@v4
- name: "Test"
run: |
mkdir -p ~/.m2
@@ -150,16 +138,25 @@ jobs:
make package test-out
doc:
- name: "Javadoc generation"
- runs-on: ubuntu-20.04
+ name: "Markdown check and Javadoc generation"
+ runs-on: ubuntu-24.04
steps:
- name: Checkout
- uses: actions/checkout@v2
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+ - name: Super-Linter
+ uses: super-linter/super-linter@12150456a73e248bdc94d0794898f94e23127c88
+ env:
+ DEFAULT_BRANCH: main
+ VALIDATE_MARKDOWN: true
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Install Java 17
- uses: actions/setup-java@v3
+ uses: actions/setup-java@v4
with:
distribution: zulu
java-version: 17
+ cache: 'maven'
- name: "javadoc"
run: |
mkdir -p ~/.m2
@@ -167,28 +164,40 @@ jobs:
./mvnw install -DskipTests
./mvnw javadoc:javadoc
- formatting-check:
- name: "C++ format check"
- runs-on: ubuntu-20.04
- strategy:
- matrix:
- path:
- - 'c++'
- - 'tools'
+ cpp-linter:
+ runs-on: ubuntu-24.04
steps:
- - uses: actions/checkout@v3
- - name: Run clang-format style check for C++ code
- uses: jidicula/clang-format-action@v4.9.0
- with:
- clang-format-version: '13'
- check-path: ${{ matrix.path }}
+ - uses: actions/checkout@v4
+ - name: Run build
+ run: |
+ mkdir build && cd build
+ cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DBUILD_JAVA=OFF
+ cmake --build .
+ - uses: cpp-linter/cpp-linter-action@v2.13.3
+ id: linter
+ continue-on-error: true
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ style: file
+ tidy-checks: file
+ files-changed-only: true
+ lines-changed-only: true
+ thread-comments: true
+ ignore: 'build|cmake_modules|conan|dev|docker|examples|java|site'
+ database: build
+ - name: Fail fast?!
+ if: steps.linter.outputs.checks-failed != 0
+ run: |
+ echo "some linter checks failed. ${{ steps.linter.outputs.checks-failed }}"
+ exit 1
license-check:
name: "License Check"
runs-on: ubuntu-latest
steps:
- name: Checkout repository
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
- name: Check license header
uses: apache/skywalking-eyes@main
env:
@@ -196,3 +205,53 @@ jobs:
with:
config: .github/.licenserc.yaml
+ macos-cpp-check:
+ name: "C++ Test on macOS"
+ strategy:
+ fail-fast: false
+ matrix:
+ version: [13, 14, 15]
+ runs-on: macos-${{ matrix.version }}
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+ - name: Install dependencies
+ run: |
+ brew update
+ brew install protobuf
+ - name: Test
+ run: |
+ CMAKE_PREFIX_PATH=$(brew --prefix protobuf)
+ mkdir -p build
+ cd build
+ cmake .. -DBUILD_JAVA=OFF -DPROTOBUF_HOME=${CMAKE_PREFIX_PATH}
+ make package test-out
+
+ meson:
+ name: "Meson C++ configuration"
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os:
+ - ubuntu-22.04
+ - ubuntu-24.04
+ - ubuntu-24.04-arm
+ - macos-13
+ - macos-14
+ - macos-15
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: '3.x'
+ - name: Install meson
+ run: |
+ pip install --upgrade pip
+ pip install meson
+ - name: Test
+ run: |
+ meson setup build -Dbuildtype=release
+ meson compile -C build
+ meson test -C build
diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml
new file mode 100644
index 0000000000..52b2e1fc7b
--- /dev/null
+++ b/.github/workflows/pages.yml
@@ -0,0 +1,72 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: GitHub Pages deployment
+
+on:
+ push:
+ branches:
+ - main
+
+concurrency:
+ group: 'docs preview'
+ cancel-in-progress: false
+
+jobs:
+ docs:
+ name: Build and deploy documentation
+ runs-on: ubuntu-latest
+ permissions:
+ id-token: write
+ pages: write
+ environment:
+ name: github-pages # https://github.com/actions/deploy-pages/issues/271
+ if: github.repository == 'apache/orc'
+ steps:
+ - name: Checkout ORC repository
+ uses: actions/checkout@v4
+ with:
+ repository: apache/orc
+ ref: 'main'
+ - name: Install Java 17
+ uses: actions/setup-java@v4
+ with:
+ distribution: zulu
+ java-version: 17
+ - name: Install Ruby for documentation generation
+ uses: ruby/setup-ruby@v1
+ with:
+ ruby-version: '3.3'
+ bundler-cache: true
+ - name: Run documentation build
+ run: |
+ cd site
+ gem install bundler -n /usr/local/bin
+ bundle install --retry=100
+ git clone https://github.com/apache/orc.git -b asf-site target
+ bundle exec jekyll build -b /orc
+ - name: Setup Pages
+ uses: actions/configure-pages@v5
+ - name: Upload artifact
+ uses: actions/upload-pages-artifact@v3
+ with:
+ path: 'site/target'
+ - name: Deploy to GitHub Pages
+ id: deployment
+ uses: actions/deploy-pages@v4
diff --git a/.github/workflows/publish_snapshot.yml b/.github/workflows/publish_snapshot.yml
index 5a91bcbfc2..eb6d771238 100644
--- a/.github/workflows/publish_snapshot.yml
+++ b/.github/workflows/publish_snapshot.yml
@@ -10,7 +10,7 @@ jobs:
if: github.repository == 'apache/orc'
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@master
+ - uses: actions/checkout@v4
- uses: actions/setup-java@v3
with:
diff --git a/.gitignore b/.gitignore
index 2ff46e9694..3635e33bf2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,6 @@ dependency-reduced-pom.xml
java/bench/data
*.swp
.cache/*
+subprojects/*
+!subprojects/packagefiles
+!subprojects/*.wrap
diff --git a/.markdownlint.yaml b/.markdownlint.yaml
new file mode 100644
index 0000000000..11c7a48ee6
--- /dev/null
+++ b/.markdownlint.yaml
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+MD013: false
diff --git a/.markdownlintignore b/.markdownlintignore
new file mode 100644
index 0000000000..3953a04ce3
--- /dev/null
+++ b/.markdownlintignore
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+site
diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1fb0e755d6..9d036aa8e9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,10 +27,11 @@ project(ORC C CXX)
# Version number of package
SET(CPACK_PACKAGE_VERSION_MAJOR "2")
-SET(CPACK_PACKAGE_VERSION_MINOR "1")
+SET(CPACK_PACKAGE_VERSION_MINOR "3")
SET(CPACK_PACKAGE_VERSION_PATCH "0-SNAPSHOT")
SET(ORC_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake_modules")
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # For clang-tidy.
+list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules")
option (BUILD_JAVA
"Include ORC Java library in the build process"
@@ -42,7 +43,7 @@ option (ANALYZE_JAVA
option (BUILD_LIBHDFSPP
"Include LIBHDFSPP library in the build process"
- ON)
+ OFF)
option(BUILD_CPP_TESTS
"Build the googletest unit tests"
@@ -76,10 +77,22 @@ option(BUILD_ENABLE_AVX512
"Enable build with AVX512 at compile time"
OFF)
+option(ENABLE_ASAN
+ "Enable Address Sanitizer"
+ OFF)
+
option(ORC_PACKAGE_KIND
"Arbitrary string that identifies the kind of package"
"")
+option(ORC_ENABLE_CLANG_TOOLS
+ "Enable Clang tools"
+ OFF)
+
+option(ENABLE_UBSAN
+ "Enable Undefined Behavior Sanitizer"
+ OFF)
+
# Make sure that a build type is selected
if (NOT CMAKE_BUILD_TYPE)
message(STATUS "No build type selected, default to ReleaseWithDebugInfo")
@@ -151,17 +164,38 @@ elseif (MSVC)
set (WARN_FLAGS "${WARN_FLAGS} -wd4521") # multiple copy constructors specified
set (WARN_FLAGS "${WARN_FLAGS} -wd4146") # unary minus operator applied to unsigned type, result still unsigned
endif ()
+# Configure Address Sanitizer if enabled
+if (ENABLE_ASAN)
+ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+ message(STATUS "Address Sanitizer enabled")
+ else()
+ message(WARNING "Address Sanitizer is only supported for GCC and Clang compilers")
+ endif()
+endif()
-if (BUILD_CPP_ENABLE_METRICS)
- message(STATUS "Enable the metrics collection")
- add_compile_definitions(ENABLE_METRICS=1)
-else ()
- message(STATUS "Disable the metrics collection")
- add_compile_definitions(ENABLE_METRICS=0)
-endif ()
+# Configure Undefined Behavior Sanitizer if enabled
+if (ENABLE_UBSAN)
+ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr,function -fno-sanitize-recover=all")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr,function -fno-sanitize-recover=all")
+ message(STATUS "Undefined Behavior Sanitizer enabled")
+ else()
+ message(WARNING "Undefined Behavior Sanitizer is only supported for GCC and Clang compilers")
+ endif()
+endif()
enable_testing()
+INCLUDE(GNUInstallDirs) # Put it before ThirdpartyToolchain to make CMAKE_INSTALL_LIBDIR available.
+
+if (ORC_PACKAGE_KIND STREQUAL "vcpkg")
+ set(ORC_INSTALL_CMAKE_DIR ${CMAKE_INSTALL_DATAROOTDIR}/orc)
+else ()
+ set(ORC_INSTALL_CMAKE_DIR ${CMAKE_INSTALL_LIBDIR}/cmake/orc)
+endif ()
+
INCLUDE(CheckSourceCompiles)
INCLUDE(ThirdpartyToolchain)
@@ -180,7 +214,7 @@ if (BUILD_ENABLE_AVX512 AND NOT APPLE)
INCLUDE(ConfigSimdLevel)
endif ()
-set (EXAMPLE_DIRECTORY ${CMAKE_SOURCE_DIR}/examples)
+set (EXAMPLE_DIRECTORY ${PROJECT_SOURCE_DIR}/examples)
add_subdirectory(c++)
@@ -210,3 +244,7 @@ if (BUILD_CPP_TESTS)
)
endif ()
endif ()
+
+if (ORC_ENABLE_CLANG_TOOLS)
+ INCLUDE(CheckFormat)
+endif ()
diff --git a/README.md b/README.md
index 60b0da5fcb..2ddf0849b9 100644
--- a/README.md
+++ b/README.md
@@ -18,20 +18,21 @@ lists, maps, and unions.
This project includes both a Java library and a C++ library for reading and writing the _Optimized Row Columnar_ (ORC) file format. The C++ and Java libraries are completely independent of each other and will each read all versions of ORC files.
Releases:
-* Latest: Apache ORC releases
-* Maven Central: 
-* Downloads: Apache ORC downloads
-* Release tags: Apache ORC release tags
-* Plan: Apache ORC future release plan
+
+* Latest: [Apache ORC releases](https://orc.apache.org/releases)
+* Maven Central: [org.apache.orc](https://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.orc%22)
+* Downloads: [Apache ORC downloads](https://orc.apache.org/downloads)
+* Release tags: [Apache ORC release tags](https://github.com/apache/orc/releases)
+* Plan: [Apache ORC future release plan](https://github.com/apache/orc/milestones)
The current build status:
-* Main branch
-
-Bug tracking: Apache Jira
+* Main branch [build status](https://github.com/apache/orc/actions/workflows/build_and_test.yml?query=branch%3Amain)
+Bug tracking: [Apache Jira](https://orc.apache.org/bugs)
The subdirectories are:
+
* c++ - the c++ reader and writer
* cmake_modules - the cmake modules
* docker - docker scripts to build and test on various linuxes
@@ -43,10 +44,11 @@ The subdirectories are:
### Building
* Install java 17 or higher
-* Install maven 3.9.6 or higher
+* Install maven 3.9.9 or higher
* Install cmake 3.12 or higher
To build a release version with debug information:
+
```shell
% mkdir build
% cd build
@@ -57,6 +59,7 @@ To build a release version with debug information:
```
To build a debug version:
+
```shell
% mkdir build
% cd build
@@ -67,6 +70,7 @@ To build a debug version:
```
To build a release version without debug information:
+
```shell
% mkdir build
% cd build
@@ -77,6 +81,7 @@ To build a release version without debug information:
```
To build only the Java library:
+
```shell
% cd java
% ./mvnw package
@@ -84,6 +89,7 @@ To build only the Java library:
```
To build only the C++ library:
+
```shell
% mkdir build
% cd build
@@ -94,6 +100,7 @@ To build only the C++ library:
```
To build the C++ library with AVX512 enabled:
+
```shell
export ORC_USER_SIMD_LEVEL=AVX512
% mkdir build
@@ -102,8 +109,49 @@ export ORC_USER_SIMD_LEVEL=AVX512
% make package
% make test-out
```
+
CMake option BUILD_ENABLE_AVX512 can be set to "ON" or "OFF" (the default) at compile time. At compile time, it defines the SIMD level (AVX512) to be compiled into the binaries.
Environment variable ORC_USER_SIMD_LEVEL can be set to "AVX512" or "NONE" (the default) at run time. At run time, it defines the SIMD level used to dispatch the code which can apply SIMD optimization.
Note that if ORC_USER_SIMD_LEVEL is set to "NONE" at run time, AVX512 will not take effect at run time even if BUILD_ENABLE_AVX512 is set to "ON" at compile time.
+
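+To build with the Address and Undefined Behavior sanitizers enabled (GCC or Clang only; a sketch mirroring what the CI sanitizer workflow runs):
+
+```shell
+% mkdir build
+% cd build
+% cmake .. -DCMAKE_BUILD_TYPE=Debug -DENABLE_ASAN=ON -DENABLE_UBSAN=ON -DBUILD_JAVA=OFF
+% make
+% ctest --output-on-failure
+```
+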
+### Building with Meson
+
+While CMake is the official build system for orc, there is unofficial support for using Meson to build select parts of the project. To build a debug version of the library and test it using Meson, from the project root you can run:
+
+```shell
+meson setup build
+meson compile -C build
+meson test -C build
+```
+
+By default, Meson will build unoptimized libraries with debug symbols. By contrast, the CMake build system generates release libraries by default. If you would like to create release libraries as CMake does, set the buildtype option. You must either remove the existing build directory before changing that setting or pass the ``--reconfigure`` flag:
+
+```shell
+meson setup build -Dbuildtype=release --reconfigure
+meson compile -C build
+meson test -C build
+```
+
+Meson supports running your test suite through valgrind out of the box:
+
+```shell
+meson test -C build --wrap=valgrind
+```
+
+If you'd like to enable sanitizers, you can leverage the ``-Db_sanitize=`` option. For example, to enable both ASAN and UBSAN, you can run:
+
+```shell
+meson setup build -Dbuildtype=debug -Db_sanitize=address,undefined --reconfigure
+meson compile -C build
+meson test -C build
+```
+
+Meson takes care of detecting all dependencies on your system and downloading missing ones as required through its [Wrap system](https://mesonbuild.com/Wrap-dependency-system-manual.html). The dependencies for the project are all stored in the ``subprojects`` directory in individual wrap files. The majority of these are system-generated files created by running:
+
+```shell
+meson wrap install <wrap-name>
+```
+
+from the project root. If you are developing orc and need to add a new dependency in the future, be sure to check Meson's [WrapDB](https://mesonbuild.com/Wrapdb-projects.html) to see if a pre-configured wrap entry exists. If not, you may still manually configure the dependency as outlined in the aforementioned Wrap system documentation.
diff --git a/c++/CMakeLists.txt b/c++/CMakeLists.txt
index 449bd10f3e..38c38f7ce4 100644
--- a/c++/CMakeLists.txt
+++ b/c++/CMakeLists.txt
@@ -15,14 +15,23 @@
# specific language governing permissions and limitations
# under the License.
-include_directories (
- ${CMAKE_CURRENT_BINARY_DIR}/include
- "include"
- )
-
add_subdirectory(include)
add_subdirectory(src)
if (BUILD_CPP_TESTS)
add_subdirectory(test)
endif ()
+
+# Generate cmake package configuration files
+include(CMakePackageConfigHelpers)
+configure_package_config_file(
+ orcConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/orcConfig.cmake
+ INSTALL_DESTINATION ${ORC_INSTALL_CMAKE_DIR})
+write_basic_package_version_file(
+ ${CMAKE_CURRENT_BINARY_DIR}/orcConfigVersion.cmake
+ VERSION ${ORC_VERSION}
+ COMPATIBILITY SameMajorVersion)
+install(FILES
+ ${CMAKE_CURRENT_BINARY_DIR}/orcConfig.cmake
+ ${CMAKE_CURRENT_BINARY_DIR}/orcConfigVersion.cmake
+ DESTINATION ${ORC_INSTALL_CMAKE_DIR})
diff --git a/c++/build-support/README.md b/c++/build-support/README.md
new file mode 100644
index 0000000000..80966104bb
--- /dev/null
+++ b/c++/build-support/README.md
@@ -0,0 +1,30 @@
+# Build support
+
+The Python scripts in this folder provide code-formatting support.
+Make sure you have installed `clang-format-13`, `clang-tidy-13`, and `clang-apply-replacements-13`, and that CMake can find them.
+We pin the tool versions because different versions may generate different results.
+
+## clang-format
+
+To use `run_clang_format.py`, run the following:
+
+```shell
+mkdir build
+cd build
+cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DORC_ENABLE_CLANG_TOOLS=1
+make check-format # Do checks only
+make format # This would apply suggested changes, take care!
+```
+
+## clang-tidy
+
+To use `run_clang_tidy.py`, run the following:
+
+```shell
+mkdir build
+cd build
+cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DORC_ENABLE_CLANG_TOOLS=1
+make -j`nproc` # Important
+make check-clang-tidy # Do checks only
+make fix-clang-tidy # This would apply suggested changes, take care!
+```
diff --git a/c++/build-support/run_clang_format.py b/c++/build-support/run_clang_format.py
new file mode 100644
index 0000000000..52d2e6b255
--- /dev/null
+++ b/c++/build-support/run_clang_format.py
@@ -0,0 +1,132 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import codecs
+import difflib
+import fnmatch
+import os
+import subprocess
+import sys
+
+
+def check(arguments, source_dir):
+ formatted_filenames = []
+ error = False
+ for directory, subdirs, filenames in os.walk(source_dir):
+ fullpaths = (os.path.join(directory, filename)
+ for filename in filenames)
+ source_files = [x for x in fullpaths
+ if x.endswith(".hh") or x.endswith(".cc")]
+ formatted_filenames.extend(
+ # Filter out files that match the globs in the globs file
+ [filename for filename in source_files
+ if not any((fnmatch.fnmatch(filename, exclude_glob)
+ for exclude_glob in exclude_globs))])
+
+ if arguments.fix:
+ if not arguments.quiet:
+ # Print out each file on its own line, but run
+ # clang format once for all of the files
+ print("\n".join(map(lambda x: "Formatting {}".format(x),
+ formatted_filenames)))
+ subprocess.check_call([arguments.clang_format_binary,
+ "-i"] + formatted_filenames)
+ else:
+ for filename in formatted_filenames:
+ if not arguments.quiet:
+ print("Checking {}".format(filename))
+ #
+ # Due to some incompatibilities between Python 2 and
+ # Python 3, there are some specific actions we take here
+ # to make sure the difflib.unified_diff call works.
+ #
+ # In Python 2, the call to subprocess.check_output return
+ # a 'str' type. In Python 3, however, the call returns a
+ # 'bytes' type unless the 'encoding' argument is
+ # specified. Unfortunately, the 'encoding' argument is not
+ # in the Python 2 API. We could do an if/else here based
+ # on the version of Python we are running, but it's more
+ # straightforward to read the file in binary and do utf-8
+ # conversion. In Python 2, it's just converting string
+ # types to unicode types, whereas in Python 3 it's
+ # converting bytes types to utf-8 encoded str types. This
+ # approach ensures that the arguments to
+ # difflib.unified_diff are acceptable string types in both
+ # Python 2 and Python 3.
+ with open(filename, "rb") as reader:
+ # Run clang-format and capture its output
+ formatted = subprocess.check_output(
+ [arguments.clang_format_binary,
+ filename])
+ formatted = codecs.decode(formatted, "utf-8")
+ # Read the original file
+ original = codecs.decode(reader.read(), "utf-8")
+ # Run the equivalent of diff -u
+ diff = list(difflib.unified_diff(
+ original.splitlines(True),
+ formatted.splitlines(True),
+ fromfile=filename,
+ tofile="{} (after clang format)".format(
+ filename)))
+ if diff:
+ print("{} had clang-format style issues".format(filename))
+ # Print out the diff to stderr
+ error = True
+ sys.stderr.writelines(diff)
+ return error
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Runs clang-format on all of the source "
+ "files. If --fix is specified, the files are re-formatted "
+ "in place; otherwise the output is compared with the "
+ "existing files and a unified diff is printed for any "
+ "necessary changes")
+ parser.add_argument("clang_format_binary",
+ help="Path to the clang-format binary")
+ parser.add_argument("--exclude_globs",
+ help="Filename containing globs for files "
+ "that should be excluded from the checks")
+ parser.add_argument("--source_dirs",
+ help="Comma-separated root directories of the code")
+ parser.add_argument("--fix", default=False,
+ action="http://github.com/store_true",
+ help="If specified, will re-format the source "
+ "code instead of comparing the re-formatted "
+ "output, defaults to %(default)s")
+ parser.add_argument("--quiet", default=False,
+ action="http://github.com/store_true",
+ help="If specified, only print errors")
+
+ args = parser.parse_args()
+
+ had_err = False
+ exclude_globs = []
+ if args.exclude_globs:
+ for line in open(args.exclude_globs):
+ if line.strip() == "":
+ continue
+ if line[0] == "#":
+ continue
+ exclude_globs.append(line.strip())
+
+ for source_dir in args.source_dirs.split(','):
+ if len(source_dir) > 0:
+ had_err = check(args, source_dir) or had_err
+
+ sys.exit(1 if had_err else 0)
\ No newline at end of file
diff --git a/run_clang_tidy.py b/c++/build-support/run_clang_tidy.py
old mode 100644
new mode 100755
similarity index 100%
rename from run_clang_tidy.py
rename to c++/build-support/run_clang_tidy.py
diff --git a/c++/include/CMakeLists.txt b/c++/include/CMakeLists.txt
index 056d1b9fab..a9f8b4a3b5 100644
--- a/c++/include/CMakeLists.txt
+++ b/c++/include/CMakeLists.txt
@@ -22,10 +22,11 @@ configure_file (
install(FILES
"${CMAKE_CURRENT_BINARY_DIR}/orc/orc-config.hh"
- DESTINATION "include/orc"
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/orc"
)
install(DIRECTORY
- "orc/"
- DESTINATION "include/orc"
- FILES_MATCHING PATTERN "*.hh")
+ "orc/"
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/orc"
+ FILES_MATCHING PATTERN "*.hh"
+ )
diff --git a/c++/include/orc/Exceptions.hh b/c++/include/orc/Exceptions.hh
index 97cf5d8a0d..b19a00760c 100644
--- a/c++/include/orc/Exceptions.hh
+++ b/c++/include/orc/Exceptions.hh
@@ -67,6 +67,18 @@ namespace orc {
SchemaEvolutionError(const SchemaEvolutionError&);
SchemaEvolutionError& operator=(const SchemaEvolutionError&) = delete;
};
+
+ class CompressionError : public std::runtime_error {
+ public:
+ explicit CompressionError(const std::string& whatArg);
+ explicit CompressionError(const char* whatArg);
+ ~CompressionError() noexcept override;
+ CompressionError(const CompressionError&);
+
+ private:
+ CompressionError& operator=(const CompressionError&);
+ };
+
} // namespace orc
#endif
diff --git a/c++/include/orc/Geospatial.hh b/c++/include/orc/Geospatial.hh
new file mode 100644
index 0000000000..d3b9e28285
--- /dev/null
+++ b/c++/include/orc/Geospatial.hh
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This file contains code adapted from the Apache Arrow project.
+ *
+ * Original source:
+ * https://github.com/apache/arrow/blob/main/cpp/src/parquet/geospatial/statistics.h
+ *
+ * The original code is licensed under the Apache License, Version 2.0.
+ *
+ * Modifications may have been made from the original source.
+ */
+
+#ifndef ORC_GEOSPATIAL_HH
+#define ORC_GEOSPATIAL_HH
+
+#include <array>
+#include <cmath>
+#include <limits>
+#include <string>
+
+namespace orc::geospatial {
+
+ constexpr double INF = std::numeric_limits<double>::infinity();
+ // The maximum number of dimensions supported (X, Y, Z, M)
+ inline constexpr int MAX_DIMENSIONS = 4;
+
+ // Supported combinations of geometry dimensions
+ enum class Dimensions {
+ XY = 0, // X and Y only
+ XYZ = 1, // X, Y, and Z
+ XYM = 2, // X, Y, and M
+ XYZM = 3, // X, Y, Z, and M
+ VALUE_MIN = 0,
+ VALUE_MAX = 3
+ };
+
+ // Supported geometry types according to ISO WKB
+ enum class GeometryType {
+ POINT = 1,
+ LINESTRING = 2,
+ POLYGON = 3,
+ MULTIPOINT = 4,
+ MULTILINESTRING = 5,
+ MULTIPOLYGON = 6,
+ GEOMETRYCOLLECTION = 7,
+ VALUE_MIN = 1,
+ VALUE_MAX = 7
+ };
+
+ // BoundingBox represents the minimum bounding rectangle (or box) for a geometry.
+ // It supports up to 4 dimensions (X, Y, Z, M).
+ struct BoundingBox {
+ using XY = std::array<double, 2>;
+ using XYZ = std::array<double, 3>;
+ using XYM = std::array<double, 3>;
+ using XYZM = std::array<double, 4>;
+
+ // Default constructor: initializes to an empty bounding box.
+ BoundingBox() : min{INF, INF, INF, INF}, max{-INF, -INF, -INF, -INF} {}
+ // Constructor with explicit min/max values.
+ BoundingBox(const XYZM& mins, const XYZM& maxes) : min(mins), max(maxes) {}
+ BoundingBox(const BoundingBox& other) = default;
+ BoundingBox& operator=(const BoundingBox&) = default;
+
+ // Update the bounding box to include a 2D coordinate.
+ void updateXY(const XY& coord) {
+ updateInternal(coord);
+ }
+ // Update the bounding box to include a 3D coordinate (XYZ).
+ void updateXYZ(const XYZ& coord) {
+ updateInternal(coord);
+ }
+ // Update the bounding box to include a 3D coordinate (XYM).
+ void updateXYM(const XYM& coord) {
+ std::array<int, 3> dims = {0, 1, 3};
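+ // dims maps the XYM input onto XYZM storage slots: X->0, Y->1, M->3 (Z at 2 is skipped).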
+ for (int i = 0; i < 3; ++i) {
+ auto dim = dims[i];
+ if (!std::isnan(min[dim]) && !std::isnan(max[dim])) {
+ min[dim] = std::min(min[dim], coord[i]);
+ max[dim] = std::max(max[dim], coord[i]);
+ }
+ }
+ }
+ // Update the bounding box to include a 4D coordinate (XYZM).
+ void updateXYZM(const XYZM& coord) {
+ updateInternal(coord);
+ }
+
+ // Reset the bounding box to its initial empty state.
+ void reset() {
+ for (int i = 0; i < MAX_DIMENSIONS; ++i) {
+ min[i] = INF;
+ max[i] = -INF;
+ }
+ }
+
+ // Invalidate the bounding box (set all values to NaN).
+ void invalidate() {
+ for (int i = 0; i < MAX_DIMENSIONS; ++i) {
+ min[i] = std::numeric_limits<double>::quiet_NaN();
+ max[i] = std::numeric_limits<double>::quiet_NaN();
+ }
+ }
+
+ // Check if the bound for a given dimension is empty.
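+ // An empty bound keeps min at +INF and max at -INF, so min - max is +INF;
+ // invalidated (NaN) bounds yield NaN here and are reported as not empty.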
+ bool boundEmpty(int dim) const {
+ return std::isinf(min[dim] - max[dim]);
+ }
+
+ // Check if the bound for a given dimension is valid (not NaN).
+ bool boundValid(int dim) const {
+ return !std::isnan(min[dim]) && !std::isnan(max[dim]);
+ }
+
+ // Get the lower bound (min values).
+ const XYZM& lowerBound() const {
+ return min;
+ }
+ // Get the upper bound (max values).
+ const XYZM& upperBound() const {
+ return max;
+ }
+
+ // Get validity for each dimension.
+ std::array<bool, MAX_DIMENSIONS> dimensionValid() const {
+ return {boundValid(0), boundValid(1), boundValid(2), boundValid(3)};
+ }
+ // Get emptiness for each dimension.
+ std::array<bool, MAX_DIMENSIONS> dimensionEmpty() const {
+ return {boundEmpty(0), boundEmpty(1), boundEmpty(2), boundEmpty(3)};
+ }
+
+ // Merge another bounding box into this one.
+ void merge(const BoundingBox& other) {
+ for (int i = 0; i < MAX_DIMENSIONS; ++i) {
+ if (std::isnan(min[i]) || std::isnan(max[i]) || std::isnan(other.min[i]) ||
+ std::isnan(other.max[i])) {
+ min[i] = std::numeric_limits<double>::quiet_NaN();
+ max[i] = std::numeric_limits<double>::quiet_NaN();
+ } else {
+ min[i] = std::min(min[i], other.min[i]);
+ max[i] = std::max(max[i], other.max[i]);
+ }
+ }
+ }
+
+ // Convert the bounding box to a string representation.
+ std::string toString() const;
+
+ XYZM min; // Minimum values for each dimension
+ XYZM max; // Maximum values for each dimension
+
+ private:
+ // Internal update function for XY, XYZ, or XYZM coordinates.
+ template <typename Coord>
+ void updateInternal(const Coord& coord) {
+ for (size_t i = 0; i < coord.size(); ++i) {
+ if (!std::isnan(min[i]) && !std::isnan(max[i])) {
+ min[i] = std::min(min[i], coord[i]);
+ max[i] = std::max(max[i], coord[i]);
+ }
+ }
+ }
+ };
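+
+ // Usage sketch: grow a box from 2D points; unseen dimensions stay empty.
+ //
+ // BoundingBox box;
+ // box.updateXY({1.0, 2.0});
+ // box.updateXY({-3.0, 5.0});
+ // bool zEmpty = box.boundEmpty(2); // true: no Z coordinate was seen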
+
+ inline bool operator==(const BoundingBox& lhs, const BoundingBox& rhs) {
+ return lhs.min == rhs.min && lhs.max == rhs.max;
+ }
+ inline bool operator!=(const BoundingBox& lhs, const BoundingBox& rhs) {
+ return !(lhs == rhs);
+ }
+ inline std::ostream& operator<<(std::ostream& os, const BoundingBox& obj) {
+ os << obj.toString();
+ return os;
+ }
+
+} // namespace orc::geospatial
+
+#endif // ORC_GEOSPATIAL_HH
diff --git a/c++/include/orc/Int128.hh b/c++/include/orc/Int128.hh
index 6954c771cf..e728e70e7b 100644
--- a/c++/include/orc/Int128.hh
+++ b/c++/include/orc/Int128.hh
@@ -193,43 +193,13 @@ namespace orc {
* Shift left by the given number of bits.
* Values larger than 2**127 will shift into the sign bit.
*/
- Int128& operator<<=(uint32_t bits) {
- if (bits != 0) {
- if (bits < 64) {
- highbits_ <<= bits;
- highbits_ |= (lowbits_ >> (64 - bits));
- lowbits_ <<= bits;
- } else if (bits < 128) {
- highbits_ = static_cast<int64_t>(lowbits_) << (bits - 64);
- lowbits_ = 0;
- } else {
- highbits_ = 0;
- lowbits_ = 0;
- }
- }
- return *this;
- }
+ Int128& operator<<=(uint32_t bits);
/**
* Shift right by the given number of bits. Negative values will
* sign extend and fill with one bits.
*/
- Int128& operator>>=(uint32_t bits) {
- if (bits != 0) {
- if (bits < 64) {
- lowbits_ >>= bits;
- lowbits_ |= static_cast<uint64_t>(highbits_ << (64 - bits));
- highbits_ = static_cast<int64_t>(static_cast<uint64_t>(highbits_) >> bits);
- } else if (bits < 128) {
- lowbits_ = static_cast<uint64_t>(highbits_ >> (bits - 64));
- highbits_ = highbits_ >= 0 ? 0 : -1l;
- } else {
- highbits_ = highbits_ >= 0 ? 0 : -1l;
- lowbits_ = static_cast<uint64_t>(highbits_);
- }
- }
- return *this;
- }
+ Int128& operator>>=(uint32_t bits);
bool operator==(const Int128& right) const {
return highbits_ == right.highbits_ && lowbits_ == right.lowbits_;
diff --git a/c++/include/orc/OrcFile.hh b/c++/include/orc/OrcFile.hh
index 6e4a07bf7c..ea71567c5f 100644
--- a/c++/include/orc/OrcFile.hh
+++ b/c++/include/orc/OrcFile.hh
@@ -19,6 +19,7 @@
#ifndef ORC_FILE_HH
#define ORC_FILE_HH
+#include <future>
#include <string>
#include "orc/Reader.hh"
@@ -58,6 +59,18 @@ namespace orc {
*/
virtual void read(void* buf, uint64_t length, uint64_t offset) = 0;
+ /**
+ * Read data asynchronously into the buffer. The buffer is allocated by the caller.
+ * @param buf the buffer to read into
+ * @param length the number of bytes to read.
+ * @param offset the position in the stream to read from.
+ * @return a future that will be set when the read is complete.
+ */
+ virtual std::future<void> readAsync(void* buf, uint64_t length, uint64_t offset) {
+ return std::async(std::launch::async,
+ [this, buf, length, offset] { this->read(buf, length, offset); });
+ }
+
/**
* Get the name of the stream for error messages.
*/
@@ -127,8 +140,8 @@ namespace orc {
* @param path the uri of the file in HDFS
* @param metrics the metrics of the reader
*/
- std::unique_ptr<InputStream> readHdfsFile(const std::string& path,
- ReaderMetrics* metrics = nullptr);
+ [[deprecated("readHdfsFile is deprecated in 2.0.1")]] std::unique_ptr<InputStream> readHdfsFile(
+ const std::string& path, ReaderMetrics* metrics = nullptr);
/**
* Create a reader to read the ORC file.
diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index 4b254593ee..e9f420f113 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -40,6 +40,17 @@ namespace orc {
struct ReaderOptionsPrivate;
struct RowReaderOptionsPrivate;
+ struct CacheOptions {
+ // The maximum distance in bytes between two consecutive
+ // ranges; beyond this value, ranges are not combined
+ uint64_t holeSizeLimit = 8192;
+
+ // The maximum size in bytes of a combined range; if
+ // combining two consecutive ranges would produce a range of a
+ // size greater than this, they are not combined
+ uint64_t rangeSizeLimit = 32 * 1024 * 1024;
+ };
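+
+ // Example (a sketch): tighten coalescing for a high-latency store.
+ //
+ // CacheOptions cacheOpts;
+ // cacheOpts.holeSizeLimit = 64 * 1024;
+ // cacheOpts.rangeSizeLimit = 16 * 1024 * 1024;
+ // ReaderOptions readerOpts;
+ // readerOpts.setCacheOptions(cacheOpts);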
+
/**
* Expose the reader metrics including the latency and
* number of calls of the decompression/decoding/IO modules.
@@ -59,9 +70,20 @@ namespace orc {
std::atomic<uint64_t> IOBlockingLatencyUs{0};
std::atomic<uint64_t> SelectedRowGroupCount{0};
std::atomic<uint64_t> EvaluatedRowGroupCount{0};
+ std::atomic<uint64_t> ReadRangeCacheHits{0};
+ std::atomic<uint64_t> ReadRangeCacheMisses{0};
};
ReaderMetrics* getDefaultReaderMetrics();
+ // Row group index of a single column in a stripe.
+ struct RowGroupIndex {
+ // Positions are represented as a two-dimensional array where the first
+ // dimension is row group index and the second dimension is the position
+ // list of the row group. The size of the second dimension should be equal
+ // among all row groups.
+ std::vector<std::vector<uint64_t>> positions;
+ };
+
/**
* Options for creating a Reader.
*/
@@ -107,6 +129,11 @@ namespace orc {
*/
ReaderOptions& setReaderMetrics(ReaderMetrics* metrics);
+ /**
+ * Set the cache options.
+ */
+ ReaderOptions& setCacheOptions(const CacheOptions& cacheOptions);
+
/**
* Set the location of the tail as defined by the logical length of the
* file.
@@ -138,6 +165,11 @@ namespace orc {
* Get the reader metrics.
*/
ReaderMetrics* getReaderMetrics() const;
+
+ /**
+ * Get the cache options.
+ */
+ const CacheOptions& getCacheOptions() const;
};
/**
@@ -466,9 +498,11 @@ namespace orc {
/**
* Get the statistics about a stripe.
* @param stripeIndex the index of the stripe (0 to N-1) to get statistics about
- * @return the statistics about that stripe
+ * @param includeRowIndex whether the row index of the stripe is included
+ * @return the statistics about that stripe, including row group index statistics if requested
*/
- virtual std::unique_ptr<StripeStatistics> getStripeStatistics(uint64_t stripeIndex) const = 0;
+ virtual std::unique_ptr<StripeStatistics> getStripeStatistics(
+ uint64_t stripeIndex, bool includeRowIndex = true) const = 0;
/**
* Get the length of the data stripes in the file.
@@ -605,6 +639,33 @@ namespace orc {
*/
virtual std::map<uint32_t, BloomFilterIndex> getBloomFilters(
uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0;
+
+ /**
+ * Get row group index of all selected columns in the specified stripe
+ * @param stripeIndex index of the stripe to be read for row group index.
+ * @param included index of selected columns to return (if not specified,
+ * all columns will be returned).
+ * @return map of row group index keyed by its column index.
+ */
+ virtual std::map<uint32_t, RowGroupIndex> getRowGroupIndex(
+ uint32_t stripeIndex, const std::set<uint32_t>& included = {}) const = 0;
+
+ /**
+ * Trigger IO prefetch and cache the prefetched contents asynchronously.
+ * It is thread safe. Users should make sure requested stripes and columns
+ * do not overlap; otherwise the overlapping parts will be prefetched multiple times,
+ * which doesn't affect correctness but wastes IO and memory resources.
+ * @param stripes the stripes to prefetch
+ * @param includeTypes the types to prefetch
+ */
+ virtual void preBuffer(const std::vector<int>& stripes,
+ const std::list<uint64_t>& includeTypes) = 0;
+
+ /**
+ * Release cached entries whose right boundary is less than or equal to the given boundary.
+ * @param boundary the boundary value to release cache entries
+ */
+ virtual void releaseBuffer(uint64_t boundary) = 0;
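+
+ // Typical flow (a sketch): prefetch the stripes about to be scanned, then
+ // drop cache entries once the reader has moved past them:
+ //
+ // reader->preBuffer(/*stripes=*/{0, 1}, /*includeTypes=*/{1, 2});
+ // ... read rows from stripes 0 and 1 ...
+ // reader->releaseBuffer(/*boundary=*/endOffsetOfStripe1);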
};
/**
diff --git a/c++/include/orc/Statistics.hh b/c++/include/orc/Statistics.hh
index 4ba8c35f7d..58169abe59 100644
--- a/c++/include/orc/Statistics.hh
+++ b/c++/include/orc/Statistics.hh
@@ -19,12 +19,11 @@
#ifndef ORC_STATISTICS_HH
#define ORC_STATISTICS_HH
+#include "orc/Geospatial.hh"
#include "orc/Type.hh"
#include "orc/Vector.hh"
#include "orc/orc-config.hh"
-#include
-
namespace orc {
/**
@@ -367,6 +366,33 @@ namespace orc {
virtual int32_t getMaximumNanos() const = 0;
};
+ /**
+ * Statistics for Geometry and Geography
+ */
+ class GeospatialColumnStatistics : public ColumnStatistics {
+ public:
+ virtual ~GeospatialColumnStatistics();
+
+ /**
+ * Get bounding box
+ * @return bounding box
+ */
+ virtual const geospatial::BoundingBox& getBoundingBox() const = 0;
+
+ /**
+ * Get geospatial types
+ * @return a sorted vector of unique geometry type IDs
+ */
+ virtual std::vector<int32_t> getGeospatialTypes() const = 0;
+
+ /**
+ * Update stats by a new value
+ * @param value new value to update
+ * @param length length of the value
+ */
+ virtual void update(const char* value, size_t length) = 0;
+ };
+
class Statistics {
public:
virtual ~Statistics();
diff --git a/c++/include/orc/Type.hh b/c++/include/orc/Type.hh
index 82e0e3cc86..4bb794ff34 100644
--- a/c++/include/orc/Type.hh
+++ b/c++/include/orc/Type.hh
@@ -25,6 +25,18 @@
namespace orc {
+ namespace geospatial {
+ enum EdgeInterpolationAlgorithm {
+ SPHERICAL = 0,
+ VINCENTY = 1,
+ THOMAS = 2,
+ ANDOYER = 3,
+ KARNEY = 4
+ };
+ std::string AlgoToString(EdgeInterpolationAlgorithm algo);
+ EdgeInterpolationAlgorithm AlgoFromString(const std::string& algo);
+ } // namespace geospatial
+
enum TypeKind {
BOOLEAN = 0,
BYTE = 1,
@@ -44,7 +56,9 @@ namespace orc {
DATE = 15,
VARCHAR = 16,
CHAR = 17,
- TIMESTAMP_INSTANT = 18
+ TIMESTAMP_INSTANT = 18,
+ GEOMETRY = 19,
+ GEOGRAPHY = 20
};
class Type {
@@ -59,6 +73,10 @@ namespace orc {
virtual uint64_t getMaximumLength() const = 0;
virtual uint64_t getPrecision() const = 0;
virtual uint64_t getScale() const = 0;
+ // for geospatial types only
+ virtual const std::string& getCrs() const = 0;
+ // for geography type only
+ virtual geospatial::EdgeInterpolationAlgorithm getAlgorithm() const = 0;
virtual Type& setAttribute(const std::string& key, const std::string& value) = 0;
virtual bool hasAttributeKey(const std::string& key) const = 0;
virtual Type& removeAttribute(const std::string& key) = 0;
@@ -115,6 +133,10 @@ namespace orc {
std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements);
std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key, std::unique_ptr<Type> value);
std::unique_ptr<Type> createUnionType();
+ std::unique_ptr<Type> createGeometryType(const std::string& crs = "OGC:CRS84");
+ std::unique_ptr<Type> createGeographyType(
+ const std::string& crs = "OGC:CRS84",
+ geospatial::EdgeInterpolationAlgorithm algo = geospatial::SPHERICAL);
} // namespace orc
#endif
diff --git a/c++/include/orc/Vector.hh b/c++/include/orc/Vector.hh
index 0dfe926965..663bef9cd7 100644
--- a/c++/include/orc/Vector.hh
+++ b/c++/include/orc/Vector.hh
@@ -57,6 +57,8 @@ namespace orc {
bool hasNulls;
// whether the vector batch is encoded
bool isEncoded;
+ // whether the dictionary is decoded into vector batch
+ bool dictionaryDecoded;
// custom memory pool
MemoryPool& memoryPool;
@@ -88,6 +90,14 @@ namespace orc {
*/
virtual bool hasVariableLength();
+ /**
+ * Decode possible dictionary into vector batch.
+ */
+ void decodeDictionary();
+
+ protected:
+ virtual void decodeDictionaryImpl() {}
+
private:
ColumnVectorBatch(const ColumnVectorBatch&);
ColumnVectorBatch& operator=(const ColumnVectorBatch&);
@@ -248,6 +258,10 @@ namespace orc {
~EncodedStringVectorBatch() override;
std::string toString() const override;
void resize(uint64_t capacity) override;
+
+ // Calculate data and length in StringVectorBatch from dictionary and index
+ void decodeDictionaryImpl() override;
+
std::shared_ptr<StringDictionary> dictionary;
// index for dictionary entry
@@ -264,6 +278,9 @@ namespace orc {
bool hasVariableLength() override;
std::vector<ColumnVectorBatch*> fields;
+
+ protected:
+ void decodeDictionaryImpl() override;
};
struct ListVectorBatch : public ColumnVectorBatch {
@@ -283,6 +300,9 @@ namespace orc {
// the concatenated elements
std::unique_ptr<ColumnVectorBatch> elements;
+
+ protected:
+ void decodeDictionaryImpl() override;
};
struct MapVectorBatch : public ColumnVectorBatch {
@@ -304,6 +324,9 @@ namespace orc {
std::unique_ptr<ColumnVectorBatch> keys;
// the concatenated elements
std::unique_ptr<ColumnVectorBatch> elements;
+
+ protected:
+ void decodeDictionaryImpl() override;
};
struct UnionVectorBatch : public ColumnVectorBatch {
@@ -327,6 +350,9 @@ namespace orc {
// the sub-columns
std::vector<ColumnVectorBatch*> children;
+
+ protected:
+ void decodeDictionaryImpl() override;
};
struct Decimal {
diff --git a/c++/include/orc/Writer.hh b/c++/include/orc/Writer.hh
index 7968fbce7f..78f06739bc 100644
--- a/c++/include/orc/Writer.hh
+++ b/c++/include/orc/Writer.hh
@@ -277,6 +277,32 @@ namespace orc {
* @return if not set, return default value which is 1 MB.
*/
uint64_t getOutputBufferCapacity() const;
+
+ /**
+ * Set the initial block size of the original input buffer in the class CompressionStream.
+ * The input buffer is used to store raw data before compression, while the output buffer is
+ * dedicated to holding compressed data.
+ */
+ WriterOptions& setMemoryBlockSize(uint64_t capacity);
+
+ /**
+ * Get the initial block size of original input buffer in the class CompressionStream.
+ * @return if not set, return default value which is 64 KB.
+ */
+ uint64_t getMemoryBlockSize() const;
+
+ /**
+ * Set whether the compression block should be aligned to row group boundary.
+ * The boolean type may not be aligned to row group boundary due to the
+ * requirement of the Boolean RLE encoder to pack input bits into bytes
+ */
+ WriterOptions& setAlignBlockBoundToRowGroup(bool alignBlockBoundToRowGroup);
+
+ /**
+ * Get if the compression block should be aligned to row group boundary.
+ * @return if not set, return default value which is false.
+ */
+ bool getAlignBlockBoundToRowGroup() const;
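+
+ // Example (a sketch): start compression input buffers at 128 KB and align
+ // compression blocks to row group boundaries:
+ //
+ // WriterOptions opts;
+ // opts.setMemoryBlockSize(128 * 1024).setAlignBlockBoundToRowGroup(true);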
};
class Writer {
diff --git a/c++/include/orc/meson.build b/c++/include/orc/meson.build
new file mode 100644
index 0000000000..e2524051f0
--- /dev/null
+++ b/c++/include/orc/meson.build
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+cdata = configuration_data()
+cdata.set('ORC_VERSION', meson.project_version())
+cdata.set('ORC_CXX_HAS_CSTDINT', 1)
+
+configure_file(
+ input: 'orc-config.hh.in',
+ output: 'orc-config.hh',
+ configuration: cdata,
+ format: 'cmake',
+ install: true,
+ install_dir: 'orc',
+)
+
+install_headers(
+ [
+ 'BloomFilter.hh',
+ 'ColumnPrinter.hh',
+ 'Common.hh',
+ 'Exceptions.hh',
+ 'Geospatial.hh',
+ 'Int128.hh',
+ 'MemoryPool.hh',
+ 'OrcFile.hh',
+ 'Reader.hh',
+ 'Statistics.hh',
+ 'Type.hh',
+ 'Vector.hh',
+ 'Writer.hh',
+ ],
+ subdir: 'orc',
+)
+
+install_headers(
+ [
+ 'sargs/Literal.hh',
+ 'sargs/SearchArgument.hh',
+ 'sargs/TruthValue.hh',
+ ],
+ subdir: 'orc/sargs',
+)
diff --git a/c++/include/orc/sargs/SearchArgument.hh b/c++/include/orc/sargs/SearchArgument.hh
index 6493840a92..2fa3ea04cb 100644
--- a/c++/include/orc/sargs/SearchArgument.hh
+++ b/c++/include/orc/sargs/SearchArgument.hh
@@ -251,6 +251,12 @@ namespace orc {
* @return the new SearchArgument
*/
virtual std::unique_ptr build() = 0;
+
+ /**
+ * Add a maybe leaf to the current item on the stack.
+ * @return this
+ */
+ virtual SearchArgumentBuilder& maybe() = 0;
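+
+ // Sketch: add a leaf that always evaluates to "maybe", so it never filters
+ // anything out (useful when a predicate cannot be pushed down):
+ //
+ // auto sarg = SearchArgumentFactory::newBuilder()
+ // ->startAnd().maybe().end().build();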
};
/**
diff --git a/c++/meson.build b/c++/meson.build
new file mode 100644
index 0000000000..216d7e5634
--- /dev/null
+++ b/c++/meson.build
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# required dependencies
+protobuf_dep = dependency('protobuf', fallback: ['protobuf', 'protobuf_dep'])
+lz4_dep = dependency('liblz4')
+snappy_dep = dependency('snappy')
+zlib_dep = dependency('zlib')
+zstd_dep = dependency('libzstd')
+sparsehash_c11_dep = dependency('sparsehash-c11')
+
+# optional dependencies (should be set later in configuration)
+gtest_dep = disabler()
+gmock_dep = disabler()
+
+subdir('include/orc')
+subdir('src')
+
+if get_option('tests').enabled()
+ gtest_dep = dependency('gtest')
+ gmock_dep = dependency('gmock')
+ subdir('test')
+endif
+
+pkg = import('pkgconfig')
+pkg.generate(orc_lib)
diff --git a/c++/orcConfig.cmake.in b/c++/orcConfig.cmake.in
new file mode 100644
index 0000000000..49663b3423
--- /dev/null
+++ b/c++/orcConfig.cmake.in
@@ -0,0 +1,103 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# This config sets the following variables in your project:
+#
+# orc_VERSION - version of the found ORC
+# orc_FOUND - true if ORC found on the system
+#
+# This config sets the following targets in your project:
+#
+# orc::orc - the ORC static library target to link against
+#
+# For backward compatibility, this config also sets the following variables:
+#
+# ORC_FOUND - same as orc_FOUND above
+# ORC_STATIC_LIB - static library of the found ORC
+# ORC_INCLUDE_DIR - include directory of the found ORC
+# ORC_INCLUDE_DIRS - same as ORC_INCLUDE_DIR above
+
+@PACKAGE_INIT@
+
+set(ORC_VENDOR_DEPENDENCIES "@ORC_VENDOR_DEPENDENCIES@")
+set(ORC_SYSTEM_DEPENDENCIES "@ORC_SYSTEM_DEPENDENCIES@")
+
+if(DEFINED CMAKE_MODULE_PATH)
+ set(ORC_CMAKE_MODULE_PATH_OLD ${CMAKE_MODULE_PATH})
+else()
+ unset(ORC_CMAKE_MODULE_PATH_OLD)
+endif()
+set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
+
+include(CMakeFindDependencyMacro)
+foreach(dependency ${ORC_SYSTEM_DEPENDENCIES})
+ find_dependency(${dependency})
+endforeach()
+
+if(DEFINED ORC_CMAKE_MODULE_PATH_OLD)
+ set(CMAKE_MODULE_PATH ${ORC_CMAKE_MODULE_PATH_OLD})
+ unset(ORC_CMAKE_MODULE_PATH_OLD)
+else()
+ unset(CMAKE_MODULE_PATH)
+endif()
+
+include("${CMAKE_CURRENT_LIST_DIR}/orcTargets.cmake")
+
+get_target_property(orc_static_configurations orc::orc IMPORTED_CONFIGURATIONS)
+
+foreach(dependency ${ORC_VENDOR_DEPENDENCIES})
+ string(REPLACE "|" ";" dependency_pair ${dependency})
+ list(LENGTH dependency_pair dependency_pair_length)
+ if(NOT dependency_pair_length EQUAL 2)
+ message(FATAL_ERROR "Invalid vendor dependency: ${dependency}")
+ endif()
+ list(GET dependency_pair 0 target_name)
+ list(GET dependency_pair 1 static_lib_name)
+
+ add_library("${target_name}" STATIC IMPORTED)
+
+ foreach(CONFIGURATION ${orc_static_configurations})
+ string(TOUPPER "${CONFIGURATION}" CONFIGURATION)
+ get_target_property(orc_static_location orc::orc LOCATION_${CONFIGURATION})
+ get_filename_component(orc_lib_dir "${orc_static_location}" DIRECTORY)
+ set_property(TARGET "${target_name}"
+ APPEND
+ PROPERTY IMPORTED_CONFIGURATIONS ${CONFIGURATION})
+ set_target_properties("${target_name}"
+ PROPERTIES IMPORTED_LOCATION_${CONFIGURATION}
+ "${orc_lib_dir}/${static_lib_name}")
+ endforeach()
+endforeach()
+
+check_required_components(orc)
+
+foreach(BUILD_TYPE_SUFFIX
+ "_RELEASE"
+ "_RELWITHDEBINFO"
+ "_MINSIZEREL"
+ "_DEBUG"
+ "")
+ if(NOT ORC_STATIC_LIB)
+ get_target_property(ORC_STATIC_LIB orc::orc IMPORTED_LOCATION${BUILD_TYPE_SUFFIX})
+ endif()
+endforeach()
+
+get_target_property(ORC_INCLUDE_DIR orc::orc INTERFACE_INCLUDE_DIRECTORIES)
+
+set(ORC_FOUND TRUE)
+set(ORC_VERSION ${orc_VERSION})
+set(ORC_INCLUDE_DIRS ${ORC_INCLUDE_DIR})
diff --git a/c++/src/Adaptor.hh.in b/c++/src/Adaptor.hh.in
index 2cce8158e2..f3ed763eb3 100644
--- a/c++/src/Adaptor.hh.in
+++ b/c++/src/Adaptor.hh.in
@@ -49,6 +49,12 @@ typedef SSIZE_T ssize_t;
ssize_t pread(int fd, void* buf, size_t count, off_t offset);
#endif
+#if defined(__GNUC__) || defined(__clang__)
+ #define NO_SANITIZE_ATTR __attribute__((no_sanitize("signed-integer-overflow", "shift")))
+#else
+ #define NO_SANITIZE_ATTR
+#endif
+
#ifdef HAS_DIAGNOSTIC_PUSH
#ifdef __clang__
#define DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
diff --git a/c++/src/BlockBuffer.hh b/c++/src/BlockBuffer.hh
index 2faf38f7f9..6d265b0e32 100644
--- a/c++/src/BlockBuffer.hh
+++ b/c++/src/BlockBuffer.hh
@@ -106,12 +106,14 @@ namespace orc {
}
void resize(uint64_t size);
+
/**
* Requests the BlockBuffer to contain at least newCapacity bytes.
* Reallocation happens if there is need of more space.
* @param newCapacity new capacity of BlockBuffer
*/
void reserve(uint64_t newCapacity);
+
/**
* Write the BlockBuffer content into OutputStream
* @param output the output stream to write to
diff --git a/c++/src/BloomFilter.cc b/c++/src/BloomFilter.cc
index 887637223a..025bdd8a03 100644
--- a/c++/src/BloomFilter.cc
+++ b/c++/src/BloomFilter.cc
@@ -208,7 +208,7 @@ namespace orc {
}
DIAGNOSTIC_POP
-
+ NO_SANITIZE_ATTR
void BloomFilterImpl::addHash(int64_t hash64) {
    int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
// In Java codes, we use "hash64 >>> 32" which is an unsigned shift op.
@@ -226,6 +226,7 @@ namespace orc {
}
}
+ NO_SANITIZE_ATTR
bool BloomFilterImpl::testHash(int64_t hash64) const {
    int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
// In Java codes, we use "hash64 >>> 32" which is an unsigned shift op.
diff --git a/c++/src/BloomFilter.hh b/c++/src/BloomFilter.hh
index ebc4a5ee04..75fb02a026 100644
--- a/c++/src/BloomFilter.hh
+++ b/c++/src/BloomFilter.hh
@@ -194,6 +194,7 @@ namespace orc {
// Thomas Wang's integer hash function
// http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
// Put this in header file so tests can use it as well.
+ NO_SANITIZE_ATTR
inline int64_t getLongHash(int64_t key) {
key = (~key) + (key << 21); // key = (key << 21) - key - 1;
key = key ^ (key >> 24);
diff --git a/c++/src/ByteRLE.cc b/c++/src/ByteRLE.cc
index bdbaad1da6..ded9f55a00 100644
--- a/c++/src/ByteRLE.cc
+++ b/c++/src/ByteRLE.cc
@@ -63,6 +63,8 @@ namespace orc {
virtual void suppress() override;
+ virtual void finishEncode() override;
+
/**
* Reset to initial state
*/
@@ -186,16 +188,17 @@ namespace orc {
void ByteRleEncoderImpl::recordPosition(PositionRecorder* recorder) const {
uint64_t flushedSize = outputStream->getSize();
-    uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition);
+    uint64_t unusedBufferSize = static_cast<uint64_t>(bufferLength - bufferPosition);
if (outputStream->isCompressed()) {
// start of the compression chunk in the stream
recorder->add(flushedSize);
- // number of decompressed bytes that need to be consumed
- recorder->add(unflushedSize);
+      // There are multiple blocks in the input buffer, but bufferPosition only records the
+      // number of live bytes in the last block. We need rawInputBufferSize to record the
+      // total length across all blocks.
+ recorder->add(outputStream->getRawInputBufferSize() - unusedBufferSize);
} else {
-      flushedSize -= static_cast<uint64_t>(bufferLength);
// byte offset of the RLE run’s start location
- recorder->add(flushedSize + unflushedSize);
+ recorder->add(flushedSize - unusedBufferSize);
}
    recorder->add(static_cast<uint64_t>(numLiterals));
}
@@ -215,6 +218,13 @@ namespace orc {
reset();
}
+ void ByteRleEncoderImpl::finishEncode() {
+ writeValues();
+ outputStream->BackUp(bufferLength - bufferPosition);
+ outputStream->finishStream();
+ bufferLength = bufferPosition = 0;
+ }
+
  std::unique_ptr<ByteRleEncoder> createByteRleEncoder(
      std::unique_ptr<BufferedOutputStream> output) {
    return std::make_unique<ByteRleEncoderImpl>(std::move(output));
diff --git a/c++/src/ByteRLE.hh b/c++/src/ByteRLE.hh
index bd19f52ecc..bee064f666 100644
--- a/c++/src/ByteRLE.hh
+++ b/c++/src/ByteRLE.hh
@@ -59,6 +59,13 @@ namespace orc {
* suppress the data and reset to initial state
*/
virtual void suppress() = 0;
+
+ /**
+ * Finalize the encoding process. This function should be called after all data required for
+ * encoding has been added. It ensures that any remaining data is processed and the final state
+ * of the encoder is set.
+ */
+ virtual void finishEncode() = 0;
};
class ByteRleDecoder {
diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt
index 33ad584840..b8a168307c 100644
--- a/c++/src/CMakeLists.txt
+++ b/c++/src/CMakeLists.txt
@@ -138,12 +138,6 @@ configure_file (
"${CMAKE_CURRENT_BINARY_DIR}/Adaptor.hh"
)
-include_directories (
- ${CMAKE_CURRENT_SOURCE_DIR}
- ${CMAKE_CURRENT_BINARY_DIR}
- ${LIBHDFSPP_INCLUDE_DIR}
- )
-
add_custom_command(OUTPUT orc_proto.pb.h orc_proto.pb.cc
COMMAND ${PROTOBUF_EXECUTABLE}
-I ../../orc-format_ep-prefix/src/orc-format_ep/src/main/proto/orc/proto
@@ -156,6 +150,7 @@ set(SOURCE_FILES
orc_proto.pb.h
io/InputStream.cc
io/OutputStream.cc
+ io/Cache.cc
sargs/ExpressionTree.cc
sargs/Literal.cc
sargs/PredicateLeaf.cc
@@ -176,6 +171,7 @@ set(SOURCE_FILES
ConvertColumnReader.cc
CpuInfoUtil.cc
Exceptions.cc
+ Geospatial.cc
Int128.cc
LzoDecompressor.cc
MemoryPool.cc
@@ -197,7 +193,6 @@ set(SOURCE_FILES
if(BUILD_LIBHDFSPP)
set(SOURCE_FILES ${SOURCE_FILES} OrcHdfsFile.cc)
- add_definitions(-DBUILD_LIBHDFSPP)
endif(BUILD_LIBHDFSPP)
if(BUILD_ENABLE_AVX512)
@@ -209,14 +204,46 @@ endif(BUILD_ENABLE_AVX512)
add_library (orc STATIC ${SOURCE_FILES})
target_link_libraries (orc
- orc::protobuf
- orc::zlib
- orc::snappy
- orc::lz4
- orc::zstd
- ${LIBHDFSPP_LIBRARIES}
+ INTERFACE
+ ${ORC_INSTALL_INTERFACE_TARGETS}
+ PRIVATE
+    $<BUILD_INTERFACE:orc::protobuf>
+    $<BUILD_INTERFACE:orc::zlib>
+    $<BUILD_INTERFACE:orc::snappy>
+    $<BUILD_INTERFACE:orc::lz4>
+    $<BUILD_INTERFACE:orc::zstd>
+    $<BUILD_INTERFACE:orc::sparsehash>
+    $<BUILD_INTERFACE:${LIBHDFSPP_LIBRARIES}>
)
+target_include_directories (orc
+  INTERFACE
+    $<INSTALL_INTERFACE:include>
+  PUBLIC
+    $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/c++/include>
+    $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/c++/include>
+  PRIVATE
+ ${CMAKE_CURRENT_BINARY_DIR}
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${LIBHDFSPP_INCLUDE_DIR}
+)
+
+if (BUILD_LIBHDFSPP)
+ target_compile_definitions(orc PUBLIC -DBUILD_LIBHDFSPP)
+endif (BUILD_LIBHDFSPP)
+
+if (BUILD_CPP_ENABLE_METRICS)
+ message(STATUS "Enable the metrics collection")
+ target_compile_definitions(orc PUBLIC ENABLE_METRICS=1)
+else ()
+ message(STATUS "Disable the metrics collection")
+ target_compile_definitions(orc PUBLIC ENABLE_METRICS=0)
+endif ()
+
add_dependencies(orc orc-format_ep)
-install(TARGETS orc DESTINATION lib)
+install(TARGETS orc EXPORT orc_targets)
+install(EXPORT orc_targets
+ DESTINATION ${ORC_INSTALL_CMAKE_DIR}
+ NAMESPACE "orc::"
+ FILE "orcTargets.cmake")
diff --git a/c++/src/ColumnPrinter.cc b/c++/src/ColumnPrinter.cc
index 8b16ecbd09..6535c612ce 100644
--- a/c++/src/ColumnPrinter.cc
+++ b/c++/src/ColumnPrinter.cc
@@ -254,6 +254,8 @@ namespace orc {
break;
case BINARY:
+ case GEOMETRY:
+ case GEOGRAPHY:
      result = std::make_unique<BinaryColumnPrinter>(buffer, param);
break;
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index e70f916ffd..89ff0e0245 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -395,7 +395,7 @@ namespace orc {
int64_t bits = 0;
if (bufferEnd_ - bufferPointer_ >= 8) {
if (isLittleEndian) {
-        bits = *(reinterpret_cast<const int64_t*>(bufferPointer_));
+ memcpy(&bits, bufferPointer_, sizeof(bits));
} else {
        bits = static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[0]));
        bits |= static_cast<int64_t>(static_cast<unsigned char>(bufferPointer_[1])) << 8;
@@ -509,8 +509,10 @@ namespace orc {
      bufferNum = std::min(numValues,
                           static_cast<uint64_t>(bufferEnd_ - bufferPointer_) / bytesPerValue_);
uint64_t bufferBytes = bufferNum * bytesPerValue_;
- memcpy(outArray, bufferPointer_, bufferBytes);
- bufferPointer_ += bufferBytes;
+ if (bufferBytes > 0) {
+ memcpy(outArray, bufferPointer_, bufferBytes);
+ bufferPointer_ += bufferBytes;
+ }
}
for (size_t i = bufferNum; i < numValues; ++i) {
outArray[i] = readDouble();
@@ -724,6 +726,9 @@ namespace orc {
if (totalBytes <= lastBufferLength_) {
// subtract the needed bytes from the ones left over
lastBufferLength_ -= totalBytes;
+ if (lastBuffer_ == nullptr) {
+ throw ParseError("StringDirectColumnReader::skip: lastBuffer_ is null");
+ }
lastBuffer_ += totalBytes;
} else {
// move the stream forward after accounting for the buffered bytes
@@ -778,7 +783,9 @@ namespace orc {
byteBatch.blob.resize(totalLength);
char* ptr = byteBatch.blob.data();
while (bytesBuffered + lastBufferLength_ < totalLength) {
- memcpy(ptr + bytesBuffered, lastBuffer_, lastBufferLength_);
+ if (lastBuffer_ != nullptr) {
+ memcpy(ptr + bytesBuffered, lastBuffer_, lastBufferLength_);
+ }
bytesBuffered += lastBufferLength_;
const void* readBuffer;
int readLength;
@@ -1740,6 +1747,8 @@ namespace orc {
case CHAR:
case STRING:
case VARCHAR:
+ case GEOMETRY:
+ case GEOGRAPHY:
      switch (static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())) {
case proto::ColumnEncoding_Kind_DICTIONARY:
case proto::ColumnEncoding_Kind_DICTIONARY_V2:
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index 86e30ce90d..915277ef41 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -17,13 +17,19 @@
*/
#include "orc/Int128.hh"
+#include "orc/Statistics.hh"
+#include "orc/Type.hh"
#include "orc/Writer.hh"
+#include
#include "ByteRLE.hh"
#include "ColumnWriter.hh"
#include "RLE.hh"
#include "Statistics.hh"
#include "Timezone.hh"
+#include "Utils.hh"
+
+#include
namespace orc {
StreamsFactory::~StreamsFactory() {
@@ -47,11 +53,11 @@ namespace orc {
// In the future, we can decide compression strategy and modifier
// based on stream kind. But for now we just use the setting from
// WriterOption
- return createCompressor(options_.getCompression(), outStream_,
- options_.getCompressionStrategy(),
- // BufferedOutputStream initial capacity
- options_.getOutputBufferCapacity(), options_.getCompressionBlockSize(),
- *options_.getMemoryPool(), options_.getWriterMetrics());
+ return createCompressor(
+ options_.getCompression(), outStream_, options_.getCompressionStrategy(),
+ // BufferedOutputStream initial capacity
+ options_.getOutputBufferCapacity(), options_.getCompressionBlockSize(),
+ options_.getMemoryBlockSize(), *options_.getMemoryPool(), options_.getWriterMetrics());
}
  std::unique_ptr<StreamsFactory> createStreamsFactory(const WriterOptions& options,
@@ -253,6 +259,10 @@ namespace orc {
// PASS
}
+ void ColumnWriter::finishStreams() {
+ notNullEncoder->finishEncode();
+ }
+
class StructColumnWriter : public ColumnWriter {
public:
StructColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -282,6 +292,8 @@ namespace orc {
virtual void reset() override;
+ virtual void finishStreams() override;
+
private:
    std::vector<std::unique_ptr<ColumnWriter>> children_;
};
@@ -415,6 +427,13 @@ namespace orc {
}
}
+ void StructColumnWriter::finishStreams() {
+ ColumnWriter::finishStreams();
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->finishStreams();
+ }
+ }
+
  template <typename BatchType>
class IntegerColumnWriter : public ColumnWriter {
public:
@@ -432,6 +451,8 @@ namespace orc {
virtual void recordPosition() const override;
+ virtual void finishStreams() override;
+
protected:
    std::unique_ptr<RleEncoder> rleEncoder;
@@ -527,6 +548,12 @@ namespace orc {
rleEncoder->recordPosition(rowIndexPosition.get());
}
+  template <typename BatchType>
+  void IntegerColumnWriter<BatchType>::finishStreams() {
+ ColumnWriter::finishStreams();
+ rleEncoder->finishEncode();
+ }
+
  template <typename BatchType>
class ByteColumnWriter : public ColumnWriter {
public:
@@ -543,6 +570,8 @@ namespace orc {
virtual void recordPosition() const override;
+ virtual void finishStreams() override;
+
private:
    std::unique_ptr<ByteRleEncoder> byteRleEncoder_;
};
@@ -591,7 +620,7 @@ namespace orc {
if (enableBloomFilter) {
bloomFilter->addLong(data[i]);
}
-          intStats->update(static_cast<int64_t>(byteData[i]), 1);
+          intStats->update(static_cast<int64_t>(static_cast<int8_t>(byteData[i])), 1);
}
}
intStats->increase(count);
@@ -636,6 +665,12 @@ namespace orc {
byteRleEncoder_->recordPosition(rowIndexPosition.get());
}
+  template <typename BatchType>
+  void ByteColumnWriter<BatchType>::finishStreams() {
+ ColumnWriter::finishStreams();
+ byteRleEncoder_->finishEncode();
+ }
+
  template <typename BatchType>
class BooleanColumnWriter : public ColumnWriter {
public:
@@ -653,6 +688,8 @@ namespace orc {
virtual void recordPosition() const override;
+ virtual void finishStreams() override;
+
private:
    std::unique_ptr<ByteRleEncoder> rleEncoder_;
};
@@ -749,6 +786,12 @@ namespace orc {
rleEncoder_->recordPosition(rowIndexPosition.get());
}
+  template <typename BatchType>
+  void BooleanColumnWriter<BatchType>::finishStreams() {
+ ColumnWriter::finishStreams();
+ rleEncoder_->finishEncode();
+ }
+
  template <typename BatchType>
class FloatingColumnWriter : public ColumnWriter {
public:
@@ -766,6 +809,8 @@ namespace orc {
virtual void recordPosition() const override;
+ virtual void finishStreams() override;
+
private:
bool isFloat_;
    std::unique_ptr<AppendOnlyBufferedStream> dataStream_;
@@ -877,30 +922,36 @@ namespace orc {
dataStream_->recordPosition(rowIndexPosition.get());
}
+  template <typename BatchType>
+  void FloatingColumnWriter<BatchType>::finishStreams() {
+ ColumnWriter::finishStreams();
+ dataStream_->finishStream();
+ }
+
/**
* Implementation of increasing sorted string dictionary
*/
class SortedStringDictionary {
public:
struct DictEntry {
-      DictEntry(const char* str, size_t len) : data(str), length(len) {}
-      const char* data;
-      size_t length;
+      DictEntry(const char* str, size_t len) : data(std::make_unique<std::string>(str, len)) {}
+
+      std::unique_ptr<std::string> data;
};
- SortedStringDictionary() : totalLength_(0) {}
+ SortedStringDictionary() : totalLength_(0) {
+      /// Need to set the empty key, otherwise dense_hash_map will not work correctly
+ keyToIndex_.set_empty_key(std::string_view{});
+ }
// insert a new string into dictionary, return its insertion order
- size_t insert(const char* data, size_t len);
+ size_t insert(const char* str, size_t len);
// write dictionary data & length to output buffer
void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder) const;
- // reorder input index buffer from insertion order to dictionary order
- void reorder(std::vector& idxBuffer) const;
-
// get dict entries in insertion order
-    void getEntriesInInsertionOrder(std::vector<const DictEntry*>&) const;
+    const std::vector<DictEntry>& getEntriesInInsertionOrder() const;
// return count of entries
size_t size() const;
@@ -911,18 +962,11 @@ namespace orc {
void clear();
private:
- struct LessThan {
- bool operator()(const DictEntry& left, const DictEntry& right) const {
- int ret = memcmp(left.data, right.data, std::min(left.length, right.length));
- if (ret != 0) {
- return ret < 0;
- }
- return left.length < right.length;
- }
- };
+ // store dictionary entries in insertion order
+    mutable std::vector<DictEntry> flatDict_;
-    std::map<DictEntry, size_t, LessThan> dict_;
-    std::vector<std::vector<char>> data_;
+    // map from string to its insertion order index
+    google::dense_hash_map<std::string_view, size_t> keyToIndex_;
uint64_t totalLength_;
// use friend class here to avoid being bothered by const function calls
@@ -935,64 +979,39 @@ namespace orc {
// insert a new string into dictionary, return its insertion order
size_t SortedStringDictionary::insert(const char* str, size_t len) {
- auto ret = dict_.insert({DictEntry(str, len), dict_.size()});
- if (ret.second) {
- // make a copy to internal storage
-      data_.push_back(std::vector<char>(len));
-      memcpy(data_.back().data(), str, len);
-      // update dictionary entry to link pointer to internal storage
-      DictEntry* entry = const_cast<DictEntry*>(&(ret.first->first));
- entry->data = data_.back().data();
+ size_t index = flatDict_.size();
+
+ auto it = keyToIndex_.find(std::string_view{str, len});
+ if (it != keyToIndex_.end()) {
+ return it->second;
+ } else {
+ flatDict_.emplace_back(str, len);
totalLength_ += len;
+
+ const auto& lastEntry = flatDict_.back();
+ keyToIndex_.emplace(std::string_view{lastEntry.data->data(), lastEntry.data->size()}, index);
+ return index;
}
- return ret.first->second;
}
// write dictionary data & length to output buffer
void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
RleEncoder* lengthEncoder) const {
- for (auto it = dict_.cbegin(); it != dict_.cend(); ++it) {
- dataStream->write(it->first.data, it->first.length);
-      lengthEncoder->write(static_cast<int64_t>(it->first.length));
- }
- }
-
- /**
- * Reorder input index buffer from insertion order to dictionary order
- *
- * We require this function because string values are buffered by indexes
- * in their insertion order. Until the entire dictionary is complete can
- * we get their sorted indexes in the dictionary in that ORC specification
- * demands dictionary should be ordered. Therefore this function transforms
- * the indexes from insertion order to dictionary value order for final
- * output.
- */
-  void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const {
- // iterate the dictionary to get mapping from insertion order to value order
-    std::vector<size_t> mapping(dict_.size());
- size_t dictIdx = 0;
- for (auto it = dict_.cbegin(); it != dict_.cend(); ++it) {
- mapping[it->second] = dictIdx++;
- }
-
- // do the transformation
- for (size_t i = 0; i != idxBuffer.size(); ++i) {
-      idxBuffer[i] = static_cast<int64_t>(mapping[static_cast<size_t>(idxBuffer[i])]);
+ for (const auto& entry : flatDict_) {
+ dataStream->write(entry.data->data(), entry.data->size());
+      lengthEncoder->write(static_cast<int64_t>(entry.data->size()));
}
}
// get dict entries in insertion order
- void SortedStringDictionary::getEntriesInInsertionOrder(
-      std::vector<const DictEntry*>& entries) const {
- entries.resize(dict_.size());
- for (auto it = dict_.cbegin(); it != dict_.cend(); ++it) {
- entries[it->second] = &(it->first);
- }
+  const std::vector<SortedStringDictionary::DictEntry>&
+  SortedStringDictionary::getEntriesInInsertionOrder() const {
+ return flatDict_;
}
// return count of entries
size_t SortedStringDictionary::size() const {
- return dict_.size();
+ return flatDict_.size();
}
  // return total length of strings in the dictionary
@@ -1002,8 +1021,8 @@ namespace orc {
void SortedStringDictionary::clear() {
totalLength_ = 0;
- data_.clear();
- dict_.clear();
+ keyToIndex_.clear();
+ flatDict_.clear();
}
class StringColumnWriter : public ColumnWriter {
@@ -1028,6 +1047,8 @@ namespace orc {
virtual void reset() override;
+ virtual void finishStreams() override;
+
private:
/**
* dictionary related functions
@@ -1221,6 +1242,14 @@ namespace orc {
}
}
+ void StringColumnWriter::finishStreams() {
+ ColumnWriter::finishStreams();
+ if (!useDictionary) {
+ directDataStream->finishStream();
+ directLengthEncoder->finishEncode();
+ }
+ }
+
bool StringColumnWriter::checkDictionaryKeyRatio() {
if (!doneDictionaryCheck) {
useDictionary = dictionary.size() <=
@@ -1295,9 +1324,6 @@ namespace orc {
// flush dictionary data & length streams
dictionary.flush(dictStream.get(), dictLengthEncoder.get());
- // convert index from insertion order to dictionary order
- dictionary.reorder(dictionary.idxInDictBuffer_);
-
// write data sequences
int64_t* data = dictionary.idxInDictBuffer_.data();
if (enableIndex) {
@@ -1341,90 +1367,19 @@ namespace orc {
}
// get dictionary entries in insertion order
-    std::vector<const SortedStringDictionary::DictEntry*> entries;
- dictionary.getEntriesInInsertionOrder(entries);
+ const auto& entries = dictionary.getEntriesInInsertionOrder();
// store each length of the data into a vector
- const SortedStringDictionary::DictEntry* dictEntry = nullptr;
for (uint64_t i = 0; i != dictionary.idxInDictBuffer_.size(); ++i) {
// write one row data in direct encoding
-      dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer_[i])];
-      directDataStream->write(dictEntry->data, dictEntry->length);
-      directLengthEncoder->write(static_cast<int64_t>(dictEntry->length));
+      const auto& dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer_[i])];
+      directDataStream->write(dictEntry.data->data(), dictEntry.data->size());
+      directLengthEncoder->write(static_cast<int64_t>(dictEntry.data->size()));
}
deleteDictStreams();
}
- struct Utf8Utils {
- /**
- * Counts how many utf-8 chars of the input data
- */
- static uint64_t charLength(const char* data, uint64_t length) {
- uint64_t chars = 0;
- for (uint64_t i = 0; i < length; i++) {
- if (isUtfStartByte(data[i])) {
- chars++;
- }
- }
- return chars;
- }
-
- /**
- * Return the number of bytes required to read at most maxCharLength
- * characters in full from a utf-8 encoded byte array provided
- * by data. This does not validate utf-8 data, but
- * operates correctly on already valid utf-8 data.
- *
- * @param maxCharLength number of characters required
- * @param data the bytes of UTF-8
- * @param length the length of data to truncate
- */
- static uint64_t truncateBytesTo(uint64_t maxCharLength, const char* data, uint64_t length) {
- uint64_t chars = 0;
- if (length <= maxCharLength) {
- return length;
- }
- for (uint64_t i = 0; i < length; i++) {
- if (isUtfStartByte(data[i])) {
- chars++;
- }
- if (chars > maxCharLength) {
- return i;
- }
- }
- // everything fits
- return length;
- }
-
- /**
- * Checks if b is the first byte of a UTF-8 character.
- */
- inline static bool isUtfStartByte(char b) {
- return (b & 0xC0) != 0x80;
- }
-
- /**
- * Find the start of the last character that ends in the current string.
- * @param text the bytes of the utf-8
- * @param from the first byte location
- * @param until the last byte location
- * @return the index of the last character
- */
- static uint64_t findLastCharacter(const char* text, uint64_t from, uint64_t until) {
- uint64_t posn = until;
- /* we don't expect characters more than 5 bytes */
- while (posn >= from) {
- if (isUtfStartByte(text[posn])) {
- return posn;
- }
- posn -= 1;
- }
- /* beginning of a valid char not found */
- throw std::logic_error("Could not truncate string, beginning of a valid char not found");
- }
- };
-
class CharColumnWriter : public StringColumnWriter {
public:
CharColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options)
@@ -1639,6 +1594,8 @@ namespace orc {
virtual void recordPosition() const override;
+ virtual void finishStreams() override;
+
protected:
    std::unique_ptr<RleEncoder> secRleEncoder, nanoRleEncoder;
@@ -1779,6 +1736,12 @@ namespace orc {
nanoRleEncoder->recordPosition(rowIndexPosition.get());
}
+ void TimestampColumnWriter::finishStreams() {
+ ColumnWriter::finishStreams();
+ secRleEncoder->finishEncode();
+ nanoRleEncoder->finishEncode();
+ }
+
class DateColumnWriter : public IntegerColumnWriter {
public:
DateColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options);
@@ -1848,6 +1811,8 @@ namespace orc {
virtual void recordPosition() const override;
+ virtual void finishStreams() override;
+
protected:
RleVersion rleVersion;
uint64_t precision;
@@ -1966,6 +1931,12 @@ namespace orc {
scaleEncoder->recordPosition(rowIndexPosition.get());
}
+ void Decimal64ColumnWriter::finishStreams() {
+ ColumnWriter::finishStreams();
+ valueStream->finishStream();
+ scaleEncoder->finishEncode();
+ }
+
class Decimal64ColumnWriterV2 : public ColumnWriter {
public:
Decimal64ColumnWriterV2(const Type& type, const StreamsFactory& factory,
@@ -1982,6 +1953,8 @@ namespace orc {
virtual void recordPosition() const override;
+ virtual void finishStreams() override;
+
protected:
uint64_t precision;
uint64_t scale;
@@ -2072,6 +2045,11 @@ namespace orc {
valueEncoder->recordPosition(rowIndexPosition.get());
}
+ void Decimal64ColumnWriterV2::finishStreams() {
+ ColumnWriter::finishStreams();
+ valueEncoder->finishEncode();
+ }
+
class Decimal128ColumnWriter : public Decimal64ColumnWriter {
public:
Decimal128ColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -2187,6 +2165,8 @@ namespace orc {
virtual void reset() override;
+ virtual void finishStreams() override;
+
private:
    std::unique_ptr<RleEncoder> lengthEncoder_;
RleVersion rleVersion_;
@@ -2363,6 +2343,14 @@ namespace orc {
}
}
+ void ListColumnWriter::finishStreams() {
+ ColumnWriter::finishStreams();
+ lengthEncoder_->finishEncode();
+ if (child_) {
+ child_->finishStreams();
+ }
+ }
+
class MapColumnWriter : public ColumnWriter {
public:
MapColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options);
@@ -2395,6 +2383,8 @@ namespace orc {
virtual void reset() override;
+ virtual void finishStreams() override;
+
private:
    std::unique_ptr<ColumnWriter> keyWriter_;
    std::unique_ptr<ColumnWriter> elemWriter_;
@@ -2613,6 +2603,17 @@ namespace orc {
}
}
+ void MapColumnWriter::finishStreams() {
+ ColumnWriter::finishStreams();
+ lengthEncoder_->finishEncode();
+ if (keyWriter_) {
+ keyWriter_->finishStreams();
+ }
+ if (elemWriter_) {
+ elemWriter_->finishStreams();
+ }
+ }
+
class UnionColumnWriter : public ColumnWriter {
public:
UnionColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -2645,6 +2646,8 @@ namespace orc {
virtual void reset() override;
+ virtual void finishStreams() override;
+
private:
    std::unique_ptr<ByteRleEncoder> rleEncoder_;
    std::vector<std::unique_ptr<ColumnWriter>> children_;
@@ -2816,6 +2819,73 @@ namespace orc {
}
}
+ void UnionColumnWriter::finishStreams() {
+ ColumnWriter::finishStreams();
+ rleEncoder_->finishEncode();
+ for (uint32_t i = 0; i < children_.size(); ++i) {
+ children_[i]->finishStreams();
+ }
+ }
+
+ class GeospatialColumnWriter : public BinaryColumnWriter {
+ public:
+ GeospatialColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : BinaryColumnWriter(type, factory, options),
+ isGeometry_(type.getKind() == TypeKind::GEOMETRY) {}
+
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
+ const char* incomingMask) override {
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+      const StringVectorBatch* strBatch = dynamic_cast<const StringVectorBatch*>(&rowBatch);
+ if (strBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to StringVectorBatch");
+ }
+ auto data = &strBatch->data[offset];
+ auto length = &strBatch->length[offset];
+ const char* notNull = strBatch->hasNulls ? strBatch->notNull.data() + offset : nullptr;
+
+ bool hasNull = false;
+ GeospatialColumnStatisticsImpl* geoStats = nullptr;
+ if (isGeometry_) {
+        geoStats = dynamic_cast<GeospatialColumnStatisticsImpl*>(colIndexStatistics.get());
+ }
+
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (notNull == nullptr || notNull[i]) {
+          uint64_t len = static_cast<uint64_t>(length[i]);
+ directDataStream->write(data[i], len);
+
+ // update stats
+ if (geoStats) {
+ ++count;
+ geoStats->update(data[i], len);
+ }
+
+ if (enableBloomFilter) {
+ bloomFilter->addBytes(data[i], length[i]);
+ }
+ } else if (!hasNull) {
+ hasNull = true;
+ if (geoStats) {
+ geoStats->setHasNull(hasNull);
+ }
+ }
+ }
+
+ directLengthEncoder->add(length, numValues, notNull);
+
+ if (geoStats) {
+ geoStats->increase(count);
+ }
+ }
+
+ private:
+ bool isGeometry_;
+ };
+
  std::unique_ptr<ColumnWriter> buildWriter(const Type& type, const StreamsFactory& factory,
                                            const WriterOptions& options) {
    switch (static_cast<int64_t>(type.getKind())) {
@@ -2886,6 +2956,9 @@ namespace orc {
        return std::make_unique<MapColumnWriter>(type, factory, options);
case UNION:
        return std::make_unique<UnionColumnWriter>(type, factory, options);
+ case GEOMETRY:
+ case GEOGRAPHY:
+        return std::make_unique<GeospatialColumnWriter>(type, factory, options);
default:
throw NotImplementedYet(
"Type is not supported yet for creating "
diff --git a/c++/src/ColumnWriter.hh b/c++/src/ColumnWriter.hh
index 8afd1eb72c..1c5e15d707 100644
--- a/c++/src/ColumnWriter.hh
+++ b/c++/src/ColumnWriter.hh
@@ -179,6 +179,18 @@ namespace orc {
*/
virtual void writeDictionary();
+ /**
+ * Finalize the encoding and compressing process. This function should be
+ * called after all data required for encoding has been added. It ensures
+ * that any remaining data is processed and the final state of the streams
+ * is set.
+   * Note: the boolean type cannot cut off the current byte while it is not
+   * yet filled with 8 bits; otherwise Boolean RLE may incorrectly read the
+   * unfilled trailing bits. In that case, the last byte is carried over as
+   * the head of the next compression block.
+ */
+ virtual void finishStreams();
+
protected:
/**
* Utility function to translate ColumnStatistics into protobuf form and
diff --git a/c++/src/Compression.cc b/c++/src/Compression.cc
index 4002276e18..f373a75bff 100644
--- a/c++/src/Compression.cc
+++ b/c++/src/Compression.cc
@@ -52,19 +52,22 @@ namespace orc {
class CompressionStreamBase : public BufferedOutputStream {
public:
CompressionStreamBase(OutputStream* outStream, int compressionLevel, uint64_t capacity,
- uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics);
+ uint64_t compressionBlockSize, uint64_t memoryBlockSize, MemoryPool& pool,
+ WriterMetrics* metrics);
virtual bool Next(void** data, int* size) override = 0;
- virtual void BackUp(int count) override;
+ virtual void BackUp(int count) override = 0;
virtual std::string getName() const override = 0;
- virtual uint64_t flush() override;
- virtual void suppress() override;
+ virtual uint64_t flush() override = 0;
+ virtual void suppress() override = 0;
virtual bool isCompressed() const override {
return true;
}
virtual uint64_t getSize() const override;
+ virtual uint64_t getRawInputBufferSize() const override = 0;
+ virtual void finishStream() override = 0;
protected:
void writeData(const unsigned char* data, int size);
@@ -78,9 +81,6 @@ namespace orc {
// ensure enough room for compression block header
void ensureHeader();
- // Buffer to hold uncompressed data until user calls Next()
-    DataBuffer<unsigned char> rawInputBuffer;
-
// Compress level
int level;
@@ -99,46 +99,26 @@ namespace orc {
// Compression block header pointer array
static const uint32_t HEADER_SIZE = 3;
    std::array<char*, HEADER_SIZE> header;
+
+ // Compression block size
+ uint64_t compressionBlockSize;
};
CompressionStreamBase::CompressionStreamBase(OutputStream* outStream, int compressionLevel,
- uint64_t capacity, uint64_t blockSize,
- MemoryPool& pool, WriterMetrics* metrics)
- : BufferedOutputStream(pool, outStream, capacity, blockSize, metrics),
- rawInputBuffer(pool, blockSize),
+ uint64_t capacity, uint64_t compressionBlockSize,
+ uint64_t memoryBlockSize, MemoryPool& pool,
+ WriterMetrics* metrics)
+ : BufferedOutputStream(pool, outStream, capacity, memoryBlockSize, metrics),
level(compressionLevel),
outputBuffer(nullptr),
bufferSize(0),
outputPosition(0),
- outputSize(0) {
+ outputSize(0),
+ compressionBlockSize(compressionBlockSize) {
// init header pointer array
header.fill(nullptr);
}
- void CompressionStreamBase::BackUp(int count) {
- if (count > bufferSize) {
- throw std::logic_error("Can't backup that much!");
- }
- bufferSize -= count;
- }
-
- uint64_t CompressionStreamBase::flush() {
- void* data;
- int size;
- if (!Next(&data, &size)) {
- throw std::runtime_error("Failed to flush compression buffer.");
- }
- BufferedOutputStream::BackUp(outputSize - outputPosition);
- bufferSize = outputSize = outputPosition = 0;
- return BufferedOutputStream::flush();
- }
-
- void CompressionStreamBase::suppress() {
- outputBuffer = nullptr;
- bufferSize = outputPosition = outputSize = 0;
- BufferedOutputStream::suppress();
- }
-
uint64_t CompressionStreamBase::getSize() const {
    return BufferedOutputStream::getSize() - static_cast<uint64_t>(outputSize - outputPosition);
}
@@ -149,12 +129,12 @@ namespace orc {
while (offset < size) {
if (outputPosition == outputSize) {
        if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) {
- throw std::runtime_error("Failed to get next output buffer from output stream.");
+ throw CompressionError("Failed to get next output buffer from output stream.");
}
outputPosition = 0;
} else if (outputPosition > outputSize) {
// for safety this will unlikely happen
- throw std::logic_error("Write to an out-of-bound place during compression!");
+ throw CompressionError("Write to an out-of-bound place during compression!");
}
int currentSize = std::min(outputSize - outputPosition, size - offset);
      memcpy(outputBuffer + outputPosition, data + offset, static_cast<size_t>(currentSize));
@@ -168,7 +148,7 @@ namespace orc {
for (uint32_t i = 0; i < HEADER_SIZE; ++i) {
if (outputPosition >= outputSize) {
        if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) {
- throw std::runtime_error("Failed to get next output buffer from output stream.");
+ throw CompressionError("Failed to get next output buffer from output stream.");
}
outputPosition = 0;
}
@@ -183,31 +163,74 @@ namespace orc {
class CompressionStream : public CompressionStreamBase {
public:
CompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity,
- uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics);
+ uint64_t compressionBlockSize, uint64_t memoryBlockSize, MemoryPool& pool,
+ WriterMetrics* metrics);
virtual bool Next(void** data, int* size) override;
virtual std::string getName() const override = 0;
+ virtual void BackUp(int count) override;
+ virtual void suppress() override;
+ virtual uint64_t flush() override;
+ uint64_t getRawInputBufferSize() const override {
+ return rawInputBuffer.size();
+ }
+ virtual void finishStream() override {
+ compressInternal();
+ BufferedOutputStream::finishStream();
+ }
protected:
// return total compressed size
virtual uint64_t doStreamingCompression() = 0;
+
+ // Buffer to hold uncompressed data until user calls Next()
+ BlockBuffer rawInputBuffer;
+
+ void compressInternal();
};
+ void CompressionStream::BackUp(int count) {
+    uint64_t backup = static_cast<uint64_t>(count);
+ uint64_t currSize = rawInputBuffer.size();
+ if (backup > currSize) {
+ throw CompressionError("Can't backup that much!");
+ }
+ rawInputBuffer.resize(currSize - backup);
+ }
+
+ uint64_t CompressionStream::flush() {
+ compressInternal();
+ BufferedOutputStream::BackUp(outputSize - outputPosition);
+ rawInputBuffer.resize(0);
+ outputSize = outputPosition = 0;
+ return BufferedOutputStream::flush();
+ }
+
+ void CompressionStream::suppress() {
+ outputBuffer = nullptr;
+ outputPosition = outputSize = 0;
+ rawInputBuffer.resize(0);
+ BufferedOutputStream::suppress();
+ }
+
CompressionStream::CompressionStream(OutputStream* outStream, int compressionLevel,
- uint64_t capacity, uint64_t blockSize, MemoryPool& pool,
+ uint64_t capacity, uint64_t compressionBlockSize,
+ uint64_t memoryBlockSize, MemoryPool& pool,
WriterMetrics* metrics)
- : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, pool, metrics) {
+ : CompressionStreamBase(outStream, compressionLevel, capacity, compressionBlockSize,
+ memoryBlockSize, pool, metrics),
+ rawInputBuffer(pool, memoryBlockSize) {
// PASS
}
- bool CompressionStream::Next(void** data, int* size) {
- if (bufferSize != 0) {
+ void CompressionStream::compressInternal() {
+ if (rawInputBuffer.size() != 0) {
ensureHeader();
uint64_t preSize = getSize();
uint64_t totalCompressedSize = doStreamingCompression();
-      if (totalCompressedSize >= static_cast<uint64_t>(bufferSize)) {
-        writeHeader(static_cast<size_t>(bufferSize), true);
+      if (totalCompressedSize >= static_cast<uint64_t>(rawInputBuffer.size())) {
+        writeHeader(static_cast<size_t>(rawInputBuffer.size()), true);
// reset output buffer
outputBuffer = nullptr;
outputPosition = outputSize = 0;
@@ -215,23 +238,42 @@ namespace orc {
        BufferedOutputStream::BackUp(static_cast<int>(backup));
// copy raw input buffer into block buffer
- writeData(rawInputBuffer.data(), bufferSize);
+ uint64_t blockNumber = rawInputBuffer.getBlockNumber();
+ for (uint64_t i = 0; i < blockNumber; ++i) {
+ auto block = rawInputBuffer.getBlock(i);
+          writeData(reinterpret_cast<const unsigned char*>(block.data), block.size);
+ }
} else {
writeHeader(totalCompressedSize, false);
}
+ rawInputBuffer.resize(0);
}
+ }
- *data = rawInputBuffer.data();
-    *size = static_cast<int>(rawInputBuffer.size());
- bufferSize = *size;
+ bool CompressionStream::Next(void** data, int* size) {
+ if (rawInputBuffer.size() > compressionBlockSize) {
+ std::stringstream ss;
+ ss << "uncompressed data size " << rawInputBuffer.size()
+ << " is larger than compression block size " << compressionBlockSize;
+ throw CompressionError(ss.str());
+ }
+
+ // compress data in the rawInputBuffer when it is full
+ if (rawInputBuffer.size() == compressionBlockSize) {
+ compressInternal();
+ }
+ auto block = rawInputBuffer.getNextBlock();
+ *data = block.data;
+    *size = static_cast<int>(block.size);
return true;
}
class ZlibCompressionStream : public CompressionStream {
public:
- ZlibCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity,
- uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics);
+ ZlibCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t bufferCapacity,
+ uint64_t compressionBlockSize, uint64_t memoryBlockSize, MemoryPool& pool,
+ WriterMetrics* metrics);
virtual ~ZlibCompressionStream() override {
end();
@@ -249,42 +291,57 @@ namespace orc {
};
ZlibCompressionStream::ZlibCompressionStream(OutputStream* outStream, int compressionLevel,
- uint64_t capacity, uint64_t blockSize,
- MemoryPool& pool, WriterMetrics* metrics)
- : CompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) {
+ uint64_t bufferCapacity,
+ uint64_t compressionBlockSize,
+ uint64_t memoryBlockSize, MemoryPool& pool,
+ WriterMetrics* metrics)
+ : CompressionStream(outStream, compressionLevel, bufferCapacity, compressionBlockSize,
+ memoryBlockSize, pool, metrics) {
init();
}
uint64_t ZlibCompressionStream::doStreamingCompression() {
if (deflateReset(&strm_) != Z_OK) {
- throw std::runtime_error("Failed to reset inflate.");
+ throw CompressionError("Failed to reset inflate.");
}
-    strm_.avail_in = static_cast<unsigned int>(bufferSize);
- strm_.next_in = rawInputBuffer.data();
+ // iterate through all blocks
+ uint64_t blockId = 0;
+ bool finish = false;
do {
- if (outputPosition >= outputSize) {
-        if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) {
- throw std::runtime_error("Failed to get next output buffer from output stream.");
- }
- outputPosition = 0;
+ if (blockId == rawInputBuffer.getBlockNumber()) {
+ finish = true;
+ strm_.avail_in = 0;
+ strm_.next_in = nullptr;
+ } else {
+ auto block = rawInputBuffer.getBlock(blockId++);
+        strm_.avail_in = static_cast<unsigned int>(block.size);
+        strm_.next_in = reinterpret_cast<unsigned char*>(block.data);
}
-      strm_.next_out = reinterpret_cast<unsigned char*>(outputBuffer + outputPosition);
-      strm_.avail_out = static_cast<unsigned int>(outputSize - outputPosition);
- int ret = deflate(&strm_, Z_FINISH);
-      outputPosition = outputSize - static_cast<int>(strm_.avail_out);
+ do {
+ if (outputPosition >= outputSize) {
+          if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) {
+ throw CompressionError("Failed to get next output buffer from output stream.");
+ }
+ outputPosition = 0;
+ }
+        strm_.next_out = reinterpret_cast<unsigned char*>(outputBuffer + outputPosition);
+        strm_.avail_out = static_cast<unsigned int>(outputSize - outputPosition);
- if (ret == Z_STREAM_END) {
- break;
- } else if (ret == Z_OK) {
- // needs more buffer so will continue the loop
- } else {
- throw std::runtime_error("Failed to deflate input data.");
- }
- } while (strm_.avail_out == 0);
+ int ret = deflate(&strm_, finish ? Z_FINISH : Z_NO_FLUSH);
+        outputPosition = outputSize - static_cast<int>(strm_.avail_out);
+ if (ret == Z_STREAM_END) {
+ break;
+ } else if (ret == Z_OK) {
+ // needs more buffer so will continue the loop
+ } else {
+ throw CompressionError("Failed to deflate input data.");
+ }
+ } while (strm_.avail_out == 0);
+ } while (!finish);
return strm_.total_out;
}
@@ -305,7 +362,7 @@ namespace orc {
strm_.next_in = nullptr;
if (deflateInit2(&strm_, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) {
- throw std::runtime_error("Error while calling deflateInit2() for zlib.");
+ throw CompressionError("Error while calling deflateInit2() for zlib.");
}
}
@@ -505,7 +562,7 @@ namespace orc {
} else if (state == DECOMPRESS_START) {
NextDecompress(data, size, availableSize);
} else {
- throw std::logic_error(
+ throw CompressionError(
"Unknown compression state in "
"DecompressionStream::Next");
}
@@ -519,7 +576,7 @@ namespace orc {
void DecompressionStream::BackUp(int count) {
if (outputBuffer == nullptr || outputBufferLength != 0) {
- throw std::logic_error("Backup without previous Next in " + getName());
+ throw CompressionError("Backup without previous Next in " + getName());
}
    outputBuffer -= static_cast<size_t>(count);
    outputBufferLength = static_cast<size_t>(count);
@@ -647,13 +704,17 @@ namespace orc {
case Z_OK:
break;
case Z_MEM_ERROR:
- throw std::logic_error("Memory error from inflateInit2");
+ throw CompressionError(
+ "Memory error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2");
case Z_VERSION_ERROR:
- throw std::logic_error("Version error from inflateInit2");
+ throw CompressionError(
+ "Version error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2");
case Z_STREAM_ERROR:
- throw std::logic_error("Stream error from inflateInit2");
+ throw CompressionError(
+ "Stream error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2");
default:
- throw std::logic_error("Unknown error from inflateInit2");
+ throw CompressionError(
+ "Unknown error from ZlibDecompressionStream::ZlibDecompressionStream inflateInit2");
}
}
@@ -674,7 +735,7 @@ namespace orc {
    zstream_.next_out = reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer));
    zstream_.avail_out = static_cast<uInt>(outputDataBuffer.capacity());
if (inflateReset(&zstream_) != Z_OK) {
- throw std::logic_error(
+ throw CompressionError(
"Bad inflateReset in "
"ZlibDecompressionStream::NextDecompress");
}
@@ -694,19 +755,19 @@ namespace orc {
case Z_STREAM_END:
break;
case Z_BUF_ERROR:
- throw std::logic_error(
+ throw CompressionError(
"Buffer error in "
"ZlibDecompressionStream::NextDecompress");
case Z_DATA_ERROR:
- throw std::logic_error(
+ throw CompressionError(
"Data error in "
"ZlibDecompressionStream::NextDecompress");
case Z_STREAM_ERROR:
- throw std::logic_error(
+ throw CompressionError(
"Stream error in "
"ZlibDecompressionStream::NextDecompress");
default:
- throw std::logic_error(
+ throw CompressionError(
"Unknown error in "
"ZlibDecompressionStream::NextDecompress");
}
@@ -812,7 +873,7 @@ namespace orc {
}
if (outLength > maxOutputLength) {
- throw std::logic_error("Snappy length exceeds block size");
+ throw CompressionError("Snappy length exceeds block size");
}
if (!snappy::RawUncompress(input, length, output)) {
@@ -881,14 +942,23 @@ namespace orc {
public:
BlockCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity,
uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics)
- : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, pool, metrics),
- compressorBuffer(pool) {
+ : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, blockSize, pool,
+ metrics),
+ compressorBuffer(pool),
+ rawInputBuffer(pool, blockSize) {
// PASS
}
virtual bool Next(void** data, int* size) override;
virtual void suppress() override;
+ virtual void BackUp(int count) override;
+ virtual uint64_t flush() override;
virtual std::string getName() const override = 0;
+ uint64_t getRawInputBufferSize() const override {
+ return bufferSize;
+ }
+
+ virtual void finishStream() override;
protected:
// compresses a block and returns the compressed size
@@ -900,8 +970,23 @@ namespace orc {
// should allocate max possible compressed size
    DataBuffer<unsigned char> compressorBuffer;
+
+ // Buffer to hold uncompressed data until user calls Next()
+    DataBuffer<unsigned char> rawInputBuffer;
};
+ void BlockCompressionStream::BackUp(int count) {
+ if (count > bufferSize) {
+ throw CompressionError("Can't backup that much!");
+ }
+ bufferSize -= count;
+ }
+
+ uint64_t BlockCompressionStream::flush() {
+ finishStream();
+ return BufferedOutputStream::flush();
+ }
+
bool BlockCompressionStream::Next(void** data, int* size) {
if (bufferSize != 0) {
ensureHeader();
@@ -935,7 +1020,19 @@ namespace orc {
void BlockCompressionStream::suppress() {
compressorBuffer.resize(0);
- CompressionStreamBase::suppress();
+ outputBuffer = nullptr;
+ bufferSize = outputPosition = outputSize = 0;
+ BufferedOutputStream::suppress();
+ }
+
+ void BlockCompressionStream::finishStream() {
+ void* data;
+ int size;
+ if (!Next(&data, &size)) {
+ throw CompressionError("Failed to flush compression buffer.");
+ }
+ BufferedOutputStream::BackUp(outputSize - outputPosition);
+ bufferSize = outputSize = outputPosition = 0;
}
/**
@@ -976,7 +1073,7 @@ namespace orc {
        reinterpret_cast<char*>(compressorBuffer.data()), bufferSize,
        static_cast<int>(compressorBuffer.size()), level);
if (result == 0) {
- throw std::runtime_error("Error during block compression using lz4.");
+ throw CompressionError("Error during block compression using lz4.");
}
    return static_cast<uint64_t>(result);
}
@@ -984,7 +1081,7 @@ namespace orc {
void Lz4CompressionSteam::init() {
state_ = LZ4_createStream();
if (!state_) {
- throw std::runtime_error("Error while allocating state for lz4.");
+ throw CompressionError("Error while allocating state for lz4.");
}
}
@@ -1072,7 +1169,7 @@ namespace orc {
void ZSTDCompressionStream::init() {
cctx_ = ZSTD_createCCtx();
if (!cctx_) {
- throw std::runtime_error("Error while calling ZSTD_createCCtx() for zstd.");
+ throw CompressionError("Error while calling ZSTD_createCCtx() for zstd.");
}
}
@@ -1129,7 +1226,7 @@ namespace orc {
void ZSTDDecompressionStream::init() {
dctx_ = ZSTD_createDCtx();
if (!dctx_) {
- throw std::runtime_error("Error while calling ZSTD_createDCtx() for zstd.");
+ throw CompressionError("Error while calling ZSTD_createDCtx() for zstd.");
}
}
@@ -1140,12 +1237,10 @@ namespace orc {
DIAGNOSTIC_PUSH
-  std::unique_ptr<BufferedOutputStream> createCompressor(CompressionKind kind,
- OutputStream* outStream,
- CompressionStrategy strategy,
- uint64_t bufferCapacity,
- uint64_t compressionBlockSize,
- MemoryPool& pool, WriterMetrics* metrics) {
+  std::unique_ptr<BufferedOutputStream> createCompressor(
+ CompressionKind kind, OutputStream* outStream, CompressionStrategy strategy,
+ uint64_t bufferCapacity, uint64_t compressionBlockSize, uint64_t memoryBlockSize,
+ MemoryPool& pool, WriterMetrics* metrics) {
    switch (static_cast<int64_t>(kind)) {
case CompressionKind_NONE: {
        return std::make_unique<BufferedOutputStream>(pool, outStream, bufferCapacity,
@@ -1154,8 +1249,8 @@ namespace orc {
case CompressionKind_ZLIB: {
int level =
(strategy == CompressionStrategy_SPEED) ? Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION;
-      return std::make_unique<ZlibCompressionStream>(outStream, level, bufferCapacity,
-                                                     compressionBlockSize, pool, metrics);
+      return std::make_unique<ZlibCompressionStream>(
+          outStream, level, bufferCapacity, compressionBlockSize, memoryBlockSize, pool, metrics);
}
case CompressionKind_ZSTD: {
int level = (strategy == CompressionStrategy_SPEED) ? 1 : ZSTD_CLEVEL_DEFAULT;
diff --git a/c++/src/Compression.hh b/c++/src/Compression.hh
index 55b152dd63..24170c56b4 100644
--- a/c++/src/Compression.hh
+++ b/c++/src/Compression.hh
@@ -42,15 +42,16 @@ namespace orc {
* @param outStream the output stream that is the underlying target
* @param strategy compression strategy
* @param bufferCapacity compression stream buffer total capacity
- * @param compressionBlockSize compression buffer block size
+   * @param compressionBlockSize compression is triggered when the uncompressed input buffer
+   * reaches this size
+   * @param memoryBlockSize the allocation block size of the uncompressed input buffer
* @param pool the memory pool
+ * @param metrics the writer metrics
*/
-  std::unique_ptr<BufferedOutputStream> createCompressor(CompressionKind kind,
- OutputStream* outStream,
- CompressionStrategy strategy,
- uint64_t bufferCapacity,
- uint64_t compressionBlockSize,
- MemoryPool& pool, WriterMetrics* metrics);
+  std::unique_ptr<BufferedOutputStream> createCompressor(
+ CompressionKind kind, OutputStream* outStream, CompressionStrategy strategy,
+ uint64_t bufferCapacity, uint64_t compressionBlockSize, uint64_t memoryBlockSize,
+ MemoryPool& pool, WriterMetrics* metrics);
} // namespace orc
#endif
diff --git a/c++/src/ConvertColumnReader.cc b/c++/src/ConvertColumnReader.cc
index 67ee6d6c45..7db5b88954 100644
--- a/c++/src/ConvertColumnReader.cc
+++ b/c++/src/ConvertColumnReader.cc
@@ -17,6 +17,9 @@
*/
#include "ConvertColumnReader.hh"
+#include "Utils.hh"
+
+#include
namespace orc {
@@ -72,6 +75,23 @@ namespace orc {
}
}
+ static inline void handleParseFromStringError(ColumnVectorBatch& dstBatch, uint64_t idx,
+ bool shouldThrow, const std::string& typeName,
+ const std::string& str,
+ const std::string& expectedFormat = "") {
+ if (!shouldThrow) {
+ dstBatch.notNull.data()[idx] = 0;
+ dstBatch.hasNulls = true;
+ } else {
+ std::ostringstream ss;
+ ss << "Failed to parse " << typeName << " from string:" << str;
+ if (expectedFormat != "") {
+ ss << " the following format \"" << expectedFormat << "\" is expected";
+ }
+ throw SchemaEvolutionError(ss.str());
+ }
+ }
+
// return false if overflow
  template <typename ReadType>
static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) {
@@ -106,13 +126,13 @@ namespace orc {
bool shouldThrow) {
    constexpr bool isFileTypeFloatingPoint(std::is_floating_point<FileType>::value);
    constexpr bool isReadTypeFloatingPoint(std::is_floating_point<ReadType>::value);
-    int64_t longValue = static_cast<int64_t>(srcValue);
+
if (isFileTypeFloatingPoint) {
if (isReadTypeFloatingPoint) {
        destValue = static_cast<ReadType>(srcValue);
} else {
        if (!canFitInLong(static_cast<double>(srcValue)) ||
- !downCastToInteger(destValue, longValue)) {
+            !downCastToInteger(destValue, static_cast<int64_t>(srcValue))) {
handleOverflow(destBatch, idx, shouldThrow);
}
}
@@ -399,13 +419,14 @@ namespace orc {
ConvertToTimestampColumnReader(const Type& readType, const Type& fileType,
StripeStreams& stripe, bool throwOnOverflow)
: ConvertColumnReader(readType, fileType, stripe, throwOnOverflow),
- readerTimezone(readType.getKind() == TIMESTAMP_INSTANT ? &getTimezoneByName("GMT")
- : &stripe.getReaderTimezone()),
+ isInstant(readType.getKind() == TIMESTAMP_INSTANT),
+ readerTimezone(isInstant ? &getTimezoneByName("GMT") : &stripe.getReaderTimezone()),
needConvertTimezone(readerTimezone != &getTimezoneByName("GMT")) {}
void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
protected:
+ const bool isInstant;
const orc::Timezone* readerTimezone;
const bool needConvertTimezone;
};
@@ -558,6 +579,8 @@ namespace orc {
const auto& srcBatch = *SafeCastBatchTo(data.get());
auto& dstBatch = *SafeCastBatchTo(&rowBatch);
+ dstBatch.precision = toPrecision_;
+ dstBatch.scale = toScale_;
for (uint64_t i = 0; i < numValues; ++i) {
if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
convertDecimalToDecimal(dstBatch, i, srcBatch);
@@ -694,6 +717,318 @@ namespace orc {
const int32_t scale_;
};
+  template <typename ReadTypeBatch, typename ReadType>
+ class StringVariantToNumericColumnReader : public ConvertColumnReader {
+ public:
+ StringVariantToNumericColumnReader(const Type& readType, const Type& fileType,
+ StripeStreams& stripe, bool throwOnOverflow)
+ : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {}
+
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+ ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const StringVectorBatch>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch>(&rowBatch);
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          if constexpr (std::is_floating_point_v<ReadType>) {
+ convertToDouble(dstBatch, srcBatch, i);
+ } else {
+ convertToInteger(dstBatch, srcBatch, i);
+ }
+ }
+ }
+ }
+
+ private:
+ void convertToInteger(ReadTypeBatch& dstBatch, const StringVectorBatch& srcBatch,
+ uint64_t idx) {
+ int64_t longValue = 0;
+ const std::string longStr(srcBatch.data[idx], srcBatch.length[idx]);
+ try {
+ longValue = std::stoll(longStr);
+ } catch (...) {
+ handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Long", longStr);
+ return;
+ }
+      if constexpr (std::is_same_v<ReadType, bool>) {
+ dstBatch.data[idx] = longValue == 0 ? 0 : 1;
+ } else {
+ if (!downCastToInteger(dstBatch.data[idx], longValue)) {
+ handleOverflow(dstBatch, idx, throwOnOverflow);
+ }
+ }
+ }
+
+ void convertToDouble(ReadTypeBatch& dstBatch, const StringVectorBatch& srcBatch, uint64_t idx) {
+ const std::string floatValue(srcBatch.data[idx], srcBatch.length[idx]);
+ try {
+        if constexpr (std::is_same_v<ReadType, float>) {
+ dstBatch.data[idx] = std::stof(floatValue);
+ } else {
+ dstBatch.data[idx] = std::stod(floatValue);
+ }
+ } catch (...) {
+ handleParseFromStringError(dstBatch, idx, throwOnOverflow, typeid(readType).name(),
+ floatValue);
+ }
+ }
+ };
+
+ class StringVariantConvertColumnReader : public ConvertToStringVariantColumnReader {
+ public:
+ StringVariantConvertColumnReader(const Type& readType, const Type& fileType,
+ StripeStreams& stripe, bool throwOnOverflow)
+ : ConvertToStringVariantColumnReader(readType, fileType, stripe, throwOnOverflow) {}
+
+ uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override {
+ uint64_t size = 0;
+ strBuffer.resize(numValues);
+      const auto& srcBatch = *SafeCastBatchTo<const StringVectorBatch>(data.get());
+ const auto maxLength = readType.getMaximumLength();
+ if (readType.getKind() == STRING) {
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+ strBuffer[i] = std::string(srcBatch.data[i], srcBatch.length[i]);
+ size += strBuffer[i].size();
+ }
+ }
+ } else if (readType.getKind() == VARCHAR) {
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+ const char* charData = srcBatch.data[i];
+ uint64_t originLength = srcBatch.length[i];
+ uint64_t itemLength = Utf8Utils::truncateBytesTo(maxLength, charData, originLength);
+ strBuffer[i] = std::string(charData, itemLength);
+ size += strBuffer[i].length();
+ }
+ }
+ } else if (readType.getKind() == CHAR) {
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+ const char* charData = srcBatch.data[i];
+ uint64_t originLength = srcBatch.length[i];
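+ // charLength counts UTF-8 characters, itemLength is the byte length after truncation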
+ uint64_t charLength = Utf8Utils::charLength(charData, originLength);
+ auto itemLength = Utf8Utils::truncateBytesTo(maxLength, charData, originLength);
+ strBuffer[i] = std::string(srcBatch.data[i], itemLength);
+ // pad with spaces: each missing character adds exactly one byte
+ if (charLength < maxLength) {
+ strBuffer[i].resize(itemLength + maxLength - charLength, ' ');
+ }
+ size += strBuffer[i].length();
+ }
+ }
+ } else {
+ throw SchemaEvolutionError("Invalid type for string variant conversion: " +
+ readType.toString());
+ }
+ return size;
+ }
+ };
+
+ class StringVariantToTimestampColumnReader : public ConvertToTimestampColumnReader {
+ public:
+ StringVariantToTimestampColumnReader(const Type& readType, const Type& fileType,
+ StripeStreams& stripe, bool throwOnOverflow)
+ : ConvertToTimestampColumnReader(readType, fileType, stripe, throwOnOverflow) {}
+
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+ ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull);
+
+ const auto& srcBatch = *SafeCastBatchTo<const StringVectorBatch>(data.get());
+ auto& dstBatch = *SafeCastBatchTo<TimestampVectorBatch>(&rowBatch);
+
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+ convertToTimestamp(dstBatch, i, std::string(srcBatch.data[i], srcBatch.length[i]));
+ }
+ }
+ }
+
+ private:
+ // Algorithm: http://howardhinnant.github.io/date_algorithms.html
+ // The algorithm implements a proleptic Gregorian calendar.
+ int64_t daysFromProlepticGregorianCalendar(int32_t y, int32_t m, int32_t d) {
+ y -= m <= 2;
+ int32_t era = y / 400;
+ int32_t yoe = y - era * 400; // [0, 399]
+ int32_t doy = (153 * (m + (m > 2 ? -3 : 9)) + 2) / 5 + d - 1; // [0, 365]
+ int32_t doe = yoe * 365 + yoe / 4 - yoe / 100 + doy; // [0, 146096]
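+ // 719468 is the number of days from 0000-03-01 to 1970-01-01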
+ return 1ll * era * 146097 + doe - 719468;
+ }
+
+ std::optional<std::pair<int64_t, int64_t>> tryBestToParseFromString(
+ const std::string& timeStr) {
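+ // returns (seconds since epoch, nanoseconds) on success; timezone adjustment happens later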
+ int32_t year, month, day, hour, min, sec, nanos = 0;
+ int32_t matched = std::sscanf(timeStr.c_str(), "%4d-%2d-%2d %2d:%2d:%2d.%d", &year, &month,
+ &day, &hour, &min, &sec, &nanos);
+ if (matched != 6 && matched != 7) {
+ return std::nullopt;
+ }
+ if (nanos) {
+ if (nanos < 0 || nanos >= 1e9) {
+ return std::nullopt;
+ }
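+ // scale the fraction up to 9 digits, e.g. ".5" parsed as 5 becomes 500000000 nanoseconds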
+ while (nanos < static_cast<int32_t>(1e8)) {
+ nanos *= 10;
+ }
+ }
+ int64_t daysSinceEpoch = daysFromProlepticGregorianCalendar(year, month, day);
+ int64_t secondSinceEpoch = 60ll * (60 * (24L * daysSinceEpoch + hour) + min) + sec;
+ return std::make_optional(std::pair{secondSinceEpoch, nanos});
+ }
+
+ void convertToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx,
+ const std::string& timeStr) {
+ // Expected timestamp_instant format string : yyyy-mm-dd hh:mm:ss[.xxx] timezone
+ // Eg. "2019-07-09 13:11:00 America/Los_Angeles"
+ // Expected timestamp format string : yyyy-mm-dd hh:mm:ss[.xxx]
+ // Eg. "2019-07-09 13:11:00"
+ static std::string expectedTimestampInstantFormat = "yyyy-mm-dd hh:mm:ss[.xxx] timezone";
+ static std::string expectedTimestampFormat = "yyyy-mm-dd hh:mm:ss[.xxx]";
+ auto timestamp = tryBestToParseFromString(timeStr);
+ if (!timestamp.has_value()) {
+ if (!isInstant) {
+ handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp", timeStr,
+ expectedTimestampFormat);
+ return;
+ }
+ handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr,
+ expectedTimestampInstantFormat);
+ return;
+ }
+
+ auto& [second, nanos] = timestamp.value();
+
+ if (isInstant) {
+ size_t pos = 0; // get the name of timezone
+ pos = timeStr.find(' ', pos) + 1;
+ pos = timeStr.find(' ', pos);
+ if (pos == std::string::npos) {
+ handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr,
+ expectedTimestampInstantFormat);
+ return;
+ }
+ pos += 1;
+ size_t subStrLength = timeStr.length() - pos;
+ try {
+ second = getTimezoneByName(timeStr.substr(pos, subStrLength)).convertFromUTC(second);
+ } catch (const TimezoneError&) {
+ handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr,
+ expectedTimestampInstantFormat);
+ return;
+ }
+ } else {
+ if (needConvertTimezone) {
+ second = readerTimezone->convertFromUTC(second);
+ }
+ }
+ dstBatch.data[idx] = second;
+ dstBatch.nanoseconds[idx] = nanos;
+ }
+ };
+
+ template <typename ReadTypeBatch>
+ class StringVariantToDecimalColumnReader : public ConvertColumnReader {
+ public:
+ StringVariantToDecimalColumnReader(const Type& readType, const Type& fileType,
+ StripeStreams& stripe, bool throwOnOverflow)
+ : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow),
+ precision_(static_cast<int32_t>(readType.getPrecision())),
+ scale_(static_cast<int32_t>(readType.getScale())) {}
+
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+ ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+ const auto& srcBatch = *SafeCastBatchTo<const StringVectorBatch>(data.get());
+ auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch>(&rowBatch);
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+ convertToDecimal(dstBatch, i, std::string(srcBatch.data[i], srcBatch.length[i]));
+ }
+ }
+ }
+
+ private:
+ void convertToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, const std::string& decimalStr) {
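+ // split at the dot, validate both digit runs, then combine: integer * 10^fromScale + fraction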
+ constexpr int32_t MAX_PRECISION_128 = 38;
+ int32_t fromPrecision = 0;
+ int32_t fromScale = 0;
+ uint32_t start = 0;
+ bool negative = false;
+ if (decimalStr.empty()) {
+ handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr);
+ return;
+ }
+ auto dotPos = decimalStr.find('.');
+ if (dotPos == std::string::npos) {
+ fromScale = 0;
+ fromPrecision = decimalStr.length();
+ dotPos = decimalStr.length();
+ } else {
+ if (dotPos + 1 == decimalStr.length()) {
+ handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr);
+ return;
+ }
+ fromPrecision = decimalStr.length() - 1;
+ fromScale = decimalStr.length() - dotPos - 1;
+ }
+ if (decimalStr.front() == '-') {
+ negative = true;
+ start++;
+ fromPrecision--;
+ }
+ const std::string integerPortion = decimalStr.substr(start, dotPos - start);
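+ // reject an empty integer part, more than 38 digits, or any non-digit character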
+ if (dotPos == start || fromPrecision > MAX_PRECISION_128 || fromPrecision <= 0 ||
+ !std::all_of(integerPortion.begin(), integerPortion.end(), ::isdigit)) {
+ handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr);
+ return;
+ }
+
+ Int128 i128;
+ try {
+ bool overflow = false;
+ i128 = Int128(integerPortion);
+ // scaling up cannot overflow here: fromPrecision is already bounded by MAX_PRECISION_128
+ i128 *= scaleUpInt128ByPowerOfTen(Int128(1), fromScale, overflow);
+ } catch (const std::exception& e) {
+ handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr);
+ return;
+ }
+ if (dotPos + 1 < decimalStr.length()) {
+ const std::string fractionPortion = decimalStr.substr(dotPos + 1, fromScale);
+ if (!std::all_of(fractionPortion.begin(), fractionPortion.end(), ::isdigit)) {
+ handleOverflow(dstBatch, idx, throwOnOverflow);
+ return;
+ }
+ i128 += Int128(fractionPortion);
+ }
+
+ auto [overflow, result] = convertDecimal(i128, fromScale, precision_, scale_);
+ if (overflow) {
+ handleOverflow(dstBatch, idx, throwOnOverflow);
+ return;
+ }
+ if (negative) {
+ result.negate();
+ }
+
+ if constexpr (std::is_same_v<ReadTypeBatch, Decimal128VectorBatch>) {
+ dstBatch.values[idx] = result;
+ } else {
+ if (!result.fitsInLong()) {
+ handleOverflow(dstBatch, idx, throwOnOverflow);
+ } else {
+ dstBatch.values[idx] = result.toLong();
+ }
+ }
+ }
+
+ const int32_t precision_;
+ const int32_t scale_;
+ };
+
#define DEFINE_NUMERIC_CONVERT_READER(FROM, TO, TYPE) \
using FROM##To##TO##ColumnReader = \
NumericConvertColumnReader<FROM##VectorBatch, TO##VectorBatch, TYPE>;
@@ -730,6 +1065,18 @@ namespace orc {
using Decimal64To##TO##ColumnReader = DecimalToStringVariantColumnReader<Decimal64VectorBatch>; \
using Decimal128To##TO##ColumnReader = DecimalToStringVariantColumnReader<Decimal128VectorBatch>;
+#define DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(FROM, TO, TYPE) \
+ using FROM##To##TO##ColumnReader = StringVariantToNumericColumnReader<TO##VectorBatch, TYPE>;
+
+#define DEFINE_STRING_VARIANT_CONVERT_READER(FROM, TO) \
+ using FROM##To##TO##ColumnReader = StringVariantConvertColumnReader;
+
+#define DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(FROM, TO) \
+ using FROM##To##TO##ColumnReader = StringVariantToTimestampColumnReader;
+
+#define DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(FROM, TO) \
+ using FROM##To##TO##ColumnReader = StringVariantToDecimalColumnReader<TO##VectorBatch>;
+
DEFINE_NUMERIC_CONVERT_READER(Boolean, Byte, int8_t)
DEFINE_NUMERIC_CONVERT_READER(Boolean, Short, int16_t)
DEFINE_NUMERIC_CONVERT_READER(Boolean, Int, int32_t)
@@ -834,8 +1181,57 @@ namespace orc {
DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(Char)
DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(Varchar)
+ // String variant to numeric
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Boolean, bool)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Byte, int8_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Short, int16_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Int, int32_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Long, int64_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Float, float)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Double, double)
+
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Boolean, bool)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Byte, int8_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Short, int16_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Int, int32_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Long, int64_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Float, float)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Double, double)
+
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Boolean, bool)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Byte, int8_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Short, int16_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Int, int32_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Long, int64_t)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Float, float)
+ DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Double, double)
+
+ // String variant to string variant
+ DEFINE_STRING_VARIANT_CONVERT_READER(String, String)
+ DEFINE_STRING_VARIANT_CONVERT_READER(String, Char)
+ DEFINE_STRING_VARIANT_CONVERT_READER(String, Varchar)
+ DEFINE_STRING_VARIANT_CONVERT_READER(Char, Char)
+ DEFINE_STRING_VARIANT_CONVERT_READER(Char, String)
+ DEFINE_STRING_VARIANT_CONVERT_READER(Char, Varchar)
+ DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, String)
+ DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, Char)
+ DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, Varchar)
+
+ // String variant to timestamp
+ DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(String, Timestamp)
+ DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(Char, Timestamp)
+ DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(Varchar, Timestamp)
+
+ // String variant to decimal
+ DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(String, Decimal64)
+ DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(String, Decimal128)
+ DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Char, Decimal64)
+ DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Char, Decimal128)
+ DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Varchar, Decimal64)
+ DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Varchar, Decimal128)
+
#define CREATE_READER(NAME) \
- return std::make_unique<NAME>(_readType, fileType, stripe, throwOnOverflow);
+ return std::make_unique<NAME>(readType, fileType, stripe, throwOnOverflow);
#define CASE_CREATE_READER(TYPE, CONVERT) \
case TYPE: \
@@ -858,7 +1254,7 @@ namespace orc {
#define CASE_CREATE_DECIMAL_READER(FROM) \
case DECIMAL: { \
- if (isDecimal64(_readType)) { \
+ if (isDecimal64(readType)) { \
CREATE_READER(FROM##ToDecimal64ColumnReader) \
} else { \
CREATE_READER(FROM##ToDecimal128ColumnReader) \
@@ -868,7 +1264,7 @@ namespace orc {
#define CASE_EXCEPTION \
default: \
throw SchemaEvolutionError("Cannot convert from " + fileType.toString() + " to " + \
- _readType.toString());
+ readType.toString());
std::unique_ptr<ColumnReader> buildConvertReader(const Type& fileType, StripeStreams& stripe,
bool useTightNumericVector,
@@ -878,11 +1274,11 @@ namespace orc {
"SchemaEvolution only support tight vector, please create ColumnVectorBatch with "
"option useTightNumericVector");
}
- const auto& _readType = *stripe.getSchemaEvolution()->getReadType(fileType);
+ const auto& readType = *stripe.getSchemaEvolution()->getReadType(fileType);
switch (fileType.getKind()) {
case BOOLEAN: {
- switch (_readType.getKind()) {
+ switch (readType.getKind()) {
CASE_CREATE_READER(BYTE, BooleanToByte)
CASE_CREATE_READER(SHORT, BooleanToShort)
CASE_CREATE_READER(INT, BooleanToInt)
@@ -906,7 +1302,7 @@ namespace orc {
}
}
case BYTE: {
- switch (_readType.getKind()) {
+ switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, ByteToBoolean)
CASE_CREATE_READER(SHORT, ByteToShort)
CASE_CREATE_READER(INT, ByteToInt)
@@ -930,7 +1326,7 @@ namespace orc {
}
}
case SHORT: {
- switch (_readType.getKind()) {
+ switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, ShortToBoolean)
CASE_CREATE_READER(BYTE, ShortToByte)
CASE_CREATE_READER(INT, ShortToInt)
@@ -954,7 +1350,7 @@ namespace orc {
}
}
case INT: {
- switch (_readType.getKind()) {
+ switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, IntToBoolean)
CASE_CREATE_READER(BYTE, IntToByte)
CASE_CREATE_READER(SHORT, IntToShort)
@@ -978,7 +1374,7 @@ namespace orc {
}
}
case LONG: {
- switch (_readType.getKind()) {
+ switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, LongToBoolean)
CASE_CREATE_READER(BYTE, LongToByte)
CASE_CREATE_READER(SHORT, LongToShort)
@@ -1002,7 +1398,7 @@ namespace orc {
}
}
case FLOAT: {
- switch (_readType.getKind()) {
+ switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, FloatToBoolean)
CASE_CREATE_READER(BYTE, FloatToByte)
CASE_CREATE_READER(SHORT, FloatToShort)
@@ -1026,7 +1422,7 @@ namespace orc {
}
}
case DOUBLE: {
- switch (_readType.getKind()) {
+ switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, DoubleToBoolean)
CASE_CREATE_READER(BYTE, DoubleToByte)
CASE_CREATE_READER(SHORT, DoubleToShort)
@@ -1050,7 +1446,7 @@ namespace orc {
}
}
case DECIMAL: {
- switch (_readType.getKind()) {
+ switch (readType.getKind()) {
CASE_CREATE_FROM_DECIMAL_READER(BOOLEAN, Boolean)
CASE_CREATE_FROM_DECIMAL_READER(BYTE, Byte)
CASE_CREATE_FROM_DECIMAL_READER(SHORT, Short)
@@ -1065,13 +1461,13 @@ namespace orc {
CASE_CREATE_FROM_DECIMAL_READER(TIMESTAMP_INSTANT, Timestamp)
case DECIMAL: {
if (isDecimal64(fileType)) {
- if (isDecimal64(_readType)) {
+ if (isDecimal64(readType)) {
CREATE_READER(Decimal64ToDecimal64ColumnReader)
} else {
CREATE_READER(Decimal64ToDecimal128ColumnReader)
}
} else {
- if (isDecimal64(_readType)) {
+ if (isDecimal64(readType)) {
CREATE_READER(Decimal128ToDecimal64ColumnReader)
} else {
CREATE_READER(Decimal128ToDecimal128ColumnReader)
@@ -1087,7 +1483,96 @@ namespace orc {
CASE_EXCEPTION
}
}
- case STRING:
+ case STRING: {
+ switch (readType.getKind()) {
+ CASE_CREATE_READER(BOOLEAN, StringToBoolean)
+ CASE_CREATE_READER(BYTE, StringToByte)
+ CASE_CREATE_READER(SHORT, StringToShort)
+ CASE_CREATE_READER(INT, StringToInt)
+ CASE_CREATE_READER(LONG, StringToLong)
+ CASE_CREATE_READER(FLOAT, StringToFloat)
+ CASE_CREATE_READER(DOUBLE, StringToDouble)
+ CASE_CREATE_READER(STRING, StringToString)
+ CASE_CREATE_READER(CHAR, StringToChar)
+ CASE_CREATE_READER(VARCHAR, StringToVarchar)
+ CASE_CREATE_READER(TIMESTAMP, StringToTimestamp)
+ CASE_CREATE_READER(TIMESTAMP_INSTANT, StringToTimestamp)
+ case DECIMAL: {
+ if (isDecimal64(readType)) {
+ CREATE_READER(StringToDecimal64ColumnReader)
+ } else {
+ CREATE_READER(StringToDecimal128ColumnReader)
+ }
+ }
+ case BINARY:
+ case LIST:
+ case MAP:
+ case STRUCT:
+ case UNION:
+ case DATE:
+ CASE_EXCEPTION
+ }
+ }
+ case CHAR: {
+ switch (readType.getKind()) {
+ CASE_CREATE_READER(BOOLEAN, CharToBoolean)
+ CASE_CREATE_READER(BYTE, CharToByte)
+ CASE_CREATE_READER(SHORT, CharToShort)
+ CASE_CREATE_READER(INT, CharToInt)
+ CASE_CREATE_READER(LONG, CharToLong)
+ CASE_CREATE_READER(FLOAT, CharToFloat)
+ CASE_CREATE_READER(DOUBLE, CharToDouble)
+ CASE_CREATE_READER(STRING, CharToString)
+ CASE_CREATE_READER(CHAR, CharToChar)
+ CASE_CREATE_READER(VARCHAR, CharToVarchar)
+ CASE_CREATE_READER(TIMESTAMP, CharToTimestamp)
+ CASE_CREATE_READER(TIMESTAMP_INSTANT, CharToTimestamp)
+ case DECIMAL: {
+ if (isDecimal64(readType)) {
+ CREATE_READER(CharToDecimal64ColumnReader)
+ } else {
+ CREATE_READER(CharToDecimal128ColumnReader)
+ }
+ }
+ case BINARY:
+ case LIST:
+ case MAP:
+ case STRUCT:
+ case UNION:
+ case DATE:
+ CASE_EXCEPTION
+ }
+ }
+ case VARCHAR: {
+ switch (readType.getKind()) {
+ CASE_CREATE_READER(BOOLEAN, VarcharToBoolean)
+ CASE_CREATE_READER(BYTE, VarcharToByte)
+ CASE_CREATE_READER(SHORT, VarcharToShort)
+ CASE_CREATE_READER(INT, VarcharToInt)
+ CASE_CREATE_READER(LONG, VarcharToLong)
+ CASE_CREATE_READER(FLOAT, VarcharToFloat)
+ CASE_CREATE_READER(DOUBLE, VarcharToDouble)
+ CASE_CREATE_READER(STRING, VarcharToString)
+ CASE_CREATE_READER(CHAR, VarcharToChar)
+ CASE_CREATE_READER(VARCHAR, VarcharToVarchar)
+ CASE_CREATE_READER(TIMESTAMP, VarcharToTimestamp)
+ CASE_CREATE_READER(TIMESTAMP_INSTANT, VarcharToTimestamp)
+ case DECIMAL: {
+ if (isDecimal64(readType)) {
+ CREATE_READER(VarcharToDecimal64ColumnReader)
+ } else {
+ CREATE_READER(VarcharToDecimal128ColumnReader)
+ }
+ }
+ case BINARY:
+ case LIST:
+ case MAP:
+ case STRUCT:
+ case UNION:
+ case DATE:
+ CASE_EXCEPTION
+ }
+ }
case BINARY:
case TIMESTAMP:
case LIST:
@@ -1095,21 +1580,9 @@ namespace orc {
case STRUCT:
case UNION:
case DATE:
- case VARCHAR:
- case CHAR:
case TIMESTAMP_INSTANT:
CASE_EXCEPTION
}
}
-#undef DEFINE_NUMERIC_CONVERT_READER
-#undef DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER
-#undef DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER
-#undef DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER
-#undef DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER
-#undef DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER
-#undef CASE_CREATE_FROM_DECIMAL_READER
-#undef CASE_CREATE_READER
-#undef CASE_EXCEPTION
-
} // namespace orc
diff --git a/c++/src/CpuInfoUtil.cc b/c++/src/CpuInfoUtil.cc
index 82669de20a..588f8dc96a 100644
--- a/c++/src/CpuInfoUtil.cc
+++ b/c++/src/CpuInfoUtil.cc
@@ -74,7 +74,7 @@ namespace orc {
#if defined(_WIN32)
//------------------------------ WINDOWS ------------------------------//
- void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cache_sizes) {
+ void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cacheSizes) {
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = nullptr;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer_position = nullptr;
DWORD buffer_size = 0;
@@ -108,8 +108,8 @@ namespace orc {
if (RelationCache == buffer_position->Relationship) {
PCACHE_DESCRIPTOR cache = &buffer_position->Cache;
if (cache->Level >= 1 && cache->Level <= kCacheLevels) {
- const int64_t current = (*cache_sizes)[cache->Level - 1];
- (*cache_sizes)[cache->Level - 1] = std::max(current, cache->Size);
+ const int64_t current = (*cacheSizes)[cache->Level - 1];
+ (*cacheSizes)[cache->Level - 1] = std::max(current, cache->Size);
}
}
offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
@@ -136,23 +136,22 @@ namespace orc {
}
#endif // MINGW
- void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor,
- std::string* model_name) {
+ void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor,
+ std::string* modelName) {
int register_EAX_id = 1;
int highest_valid_id = 0;
int highest_extended_valid_id = 0;
std::bitset<32> features_ECX;
- std::array<int, 4> cpu_info;
+ std::array<int, 4> cpuInfo;
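+ // holds the EAX, EBX, ECX and EDX outputs of the CPUID instruction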
// Get highest valid id
- __cpuid(cpu_info.data(), 0);
- highest_valid_id = cpu_info[0];
+ __cpuid(cpuInfo.data(), 0);
+ highest_valid_id = cpuInfo[0];
// HEX of "GenuineIntel": 47656E75 696E6549 6E74656C
// HEX of "AuthenticAMD": 41757468 656E7469 63414D44
- if (cpu_info[1] == 0x756e6547 && cpu_info[3] == 0x49656e69 && cpu_info[2] == 0x6c65746e) {
+ if (cpuInfo[1] == 0x756e6547 && cpuInfo[3] == 0x49656e69 && cpuInfo[2] == 0x6c65746e) {
*vendor = CpuInfo::Vendor::Intel;
- } else if (cpu_info[1] == 0x68747541 && cpu_info[3] == 0x69746e65 &&
- cpu_info[2] == 0x444d4163) {
+ } else if (cpuInfo[1] == 0x68747541 && cpuInfo[3] == 0x69746e65 && cpuInfo[2] == 0x444d4163) {
*vendor = CpuInfo::Vendor::AMD;
}
@@ -161,19 +160,19 @@ namespace orc {
}
// EAX=1: Processor Info and Feature Bits
- __cpuidex(cpu_info.data(), register_EAX_id, 0);
- features_ECX = cpu_info[2];
+ __cpuidex(cpuInfo.data(), register_EAX_id, 0);
+ features_ECX = cpuInfo[2];
// Get highest extended id
- __cpuid(cpu_info.data(), 0x80000000);
- highest_extended_valid_id = cpu_info[0];
+ __cpuid(cpuInfo.data(), 0x80000000);
+ highest_extended_valid_id = cpuInfo[0];
// Retrieve CPU model name
if (highest_extended_valid_id >= static_cast<int>(0x80000004)) {
- model_name->clear();
+ modelName->clear();
for (int i = 0x80000002; i <= static_cast<int>(0x80000004); ++i) {
- __cpuidex(cpu_info.data(), i, 0);
- *model_name += std::string(reinterpret_cast<char*>(cpu_info.data()), sizeof(cpu_info));
+ __cpuidex(cpuInfo.data(), i, 0);
+ *modelName += std::string(reinterpret_cast<char*>(cpuInfo.data()), sizeof(cpuInfo));
}
}
@@ -184,37 +183,37 @@ namespace orc {
zmm_enabled = (xcr0 & 0xE0) == 0xE0;
}
- if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3;
- if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1;
- if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2;
- if (features_ECX[23]) *hardware_flags |= CpuInfo::POPCNT;
- if (features_ECX[28]) *hardware_flags |= CpuInfo::AVX;
+ if (features_ECX[9]) *hardwareFlags |= CpuInfo::SSSE3;
+ if (features_ECX[19]) *hardwareFlags |= CpuInfo::SSE4_1;
+ if (features_ECX[20]) *hardwareFlags |= CpuInfo::SSE4_2;
+ if (features_ECX[23]) *hardwareFlags |= CpuInfo::POPCNT;
+ if (features_ECX[28]) *hardwareFlags |= CpuInfo::AVX;
// cpuid with EAX=7, ECX=0: Extended Features
register_EAX_id = 7;
if (highest_valid_id > register_EAX_id) {
- __cpuidex(cpu_info.data(), register_EAX_id, 0);
- std::bitset<32> features_EBX = cpu_info[1];
+ __cpuidex(cpuInfo.data(), register_EAX_id, 0);
+ std::bitset<32> features_EBX = cpuInfo[1];
- if (features_EBX[3]) *hardware_flags |= CpuInfo::BMI1;
- if (features_EBX[5]) *hardware_flags |= CpuInfo::AVX2;
- if (features_EBX[8]) *hardware_flags |= CpuInfo::BMI2;
+ if (features_EBX[3]) *hardwareFlags |= CpuInfo::BMI1;
+ if (features_EBX[5]) *hardwareFlags |= CpuInfo::AVX2;
+ if (features_EBX[8]) *hardwareFlags |= CpuInfo::BMI2;
if (zmm_enabled) {
- if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F;
- if (features_EBX[17]) *hardware_flags |= CpuInfo::AVX512DQ;
- if (features_EBX[28]) *hardware_flags |= CpuInfo::AVX512CD;
- if (features_EBX[30]) *hardware_flags |= CpuInfo::AVX512BW;
- if (features_EBX[31]) *hardware_flags |= CpuInfo::AVX512VL;
+ if (features_EBX[16]) *hardwareFlags |= CpuInfo::AVX512F;
+ if (features_EBX[17]) *hardwareFlags |= CpuInfo::AVX512DQ;
+ if (features_EBX[28]) *hardwareFlags |= CpuInfo::AVX512CD;
+ if (features_EBX[30]) *hardwareFlags |= CpuInfo::AVX512BW;
+ if (features_EBX[31]) *hardwareFlags |= CpuInfo::AVX512VL;
}
}
}
#elif defined(CPUINFO_ARCH_ARM)
// Windows on Arm
- void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor,
- std::string* model_name) {
- *hardware_flags |= CpuInfo::ASIMD;
- // TODO: vendor, model_name
+ void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor,
+ std::string* modelName) {
+ *hardwareFlags |= CpuInfo::ASIMD;
+ // TODO: vendor, modelName
}
#endif
@@ -236,25 +235,25 @@ namespace orc {
return std::nullopt;
}
- void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cache_sizes) {
+ void OsRetrieveCacheSize(std::array<int64_t, kCacheLevels>* cacheSizes) {
static_assert(kCacheLevels >= 3, "");
auto c = IntegerSysCtlByName("hw.l1dcachesize");
if (c.has_value()) {
- (*cache_sizes)[0] = *c;
+ (*cacheSizes)[0] = *c;
}
c = IntegerSysCtlByName("hw.l2cachesize");
if (c.has_value()) {
- (*cache_sizes)[1] = *c;
+ (*cacheSizes)[1] = *c;
}
c = IntegerSysCtlByName("hw.l3cachesize");
if (c.has_value()) {
- (*cache_sizes)[2] = *c;
+ (*cacheSizes)[2] = *c;
}
}
- void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor,
- std::string* model_name) {
- // hardware_flags
+ void OsRetrieveCpuInfo(int64_t* hardwareFlags, CpuInfo::Vendor* vendor,
+ std::string* modelName) {
+ // hardwareFlags
struct SysCtlCpuFeature {
const char* name;
int64_t flag;
@@ -280,13 +279,13 @@ namespace orc {
for (const auto& feature : features) {
auto v = IntegerSysCtlByName(feature.name);
if (v.value_or(0)) {
- *hardware_flags |= feature.flag;
+ *hardwareFlags |= feature.flag;
}
}
- // TODO: vendor, model_name
+ // TODO: vendor, modelName
*vendor = CpuInfo::Vendor::Unknown;
- *model_name = "Unknown";
+ *modelName = "Unknown";
}
#else
@@ -345,7 +344,7 @@ namespace orc {
const struct {
std::string name;
int64_t flag;
- } flag_mappings[] = {
+ } flagMappings[] = {
#if defined(CPUINFO_ARCH_X86)
{"ssse3", CpuInfo::SSSE3},
{"sse4_1", CpuInfo::SSE4_1},
@@ -364,12 +363,12 @@ namespace orc {
{"asimd", CpuInfo::ASIMD},
#endif
};
- const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
+ const int64_t num_flags = sizeof(flagMappings) / sizeof(flagMappings[0]);
int64_t flags = 0;
for (int i = 0; i < num_flags; ++i) {
- if (values.find(flag_mappings[i].name) != std::string::npos) {
- flags |= flag_mappings[i].flag;
+ if (values.find(flagMappings[i].name) != std::string::npos) {
+ flags |= flagMappings[i].flag;
}
}
return flags;
@@ -469,9 +468,9 @@ namespace orc {
#elif defined(CPUINFO_ARCH_ARM)
//------------------------------ AARCH64 ------------------------------//
- bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) {
- if (simd_level == "NONE") {
- *hardware_flags &= ~CpuInfo::ASIMD;
+ bool ArchParseUserSimdLevel(const std::string& simdLevel, int64_t* hardwareFlags) {
+ if (simdLevel == "NONE") {
+ *hardwareFlags &= ~CpuInfo::ASIMD;
return true;
}
return false;
@@ -485,7 +484,7 @@ namespace orc {
#else
//------------------------------ PPC, ... ------------------------------//
- bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) {
+ bool ArchParseUserSimdLevel(const std::string& simdLevel, int64_t* hardwareFlags) {
return true;
}
@@ -496,17 +495,17 @@ namespace orc {
} // namespace
struct CpuInfo::Impl {
- int64_t hardware_flags = 0;
+ int64_t hardwareFlags = 0;
int numCores = 0;
- int64_t original_hardware_flags = 0;
+ int64_t originalHardwareFlags = 0;
Vendor vendor = Vendor::Unknown;
- std::string model_name = "Unknown";
- std::array<int64_t, kCacheLevels> cache_sizes{};
+ std::string modelName = "Unknown";
+ std::array<int64_t, kCacheLevels> cacheSizes{};
Impl() {
- OsRetrieveCacheSize(&cache_sizes);
- OsRetrieveCpuInfo(&hardware_flags, &vendor, &model_name);
- original_hardware_flags = hardware_flags;
+ OsRetrieveCacheSize(&cacheSizes);
+ OsRetrieveCpuInfo(&hardwareFlags, &vendor, &modelName);
+ originalHardwareFlags = hardwareFlags;
numCores = std::max(static_cast<int>(std::thread::hardware_concurrency()), 1);
// parse user simd level
@@ -514,7 +513,7 @@ namespace orc {
std::string userSimdLevel = maybe_env_var == nullptr ? "NONE" : std::string(maybe_env_var);
std::transform(userSimdLevel.begin(), userSimdLevel.end(), userSimdLevel.begin(),
[](unsigned char c) { return std::toupper(c); });
- if (!ArchParseUserSimdLevel(userSimdLevel, &hardware_flags)) {
+ if (!ArchParseUserSimdLevel(userSimdLevel, &hardwareFlags)) {
throw ParseError("Invalid value for ORC_USER_SIMD_LEVEL: " + userSimdLevel);
}
}
@@ -530,8 +529,8 @@ namespace orc {
#endif
const CpuInfo* CpuInfo::getInstance() {
- static CpuInfo cpu_info;
- return &cpu_info;
+ static CpuInfo cpuInfo;
+ return &cpuInfo;
}
#ifdef __clang__
@@ -539,7 +538,7 @@ namespace orc {
#endif
int64_t CpuInfo::hardwareFlags() const {
- return impl_->hardware_flags;
+ return impl_->hardwareFlags;
}
int CpuInfo::numCores() const {
@@ -551,7 +550,7 @@ namespace orc {
}
const std::string& CpuInfo::modelName() const {
- return impl_->model_name;
+ return impl_->modelName;
}
int64_t CpuInfo::cacheSize(CacheLevel level) const {
@@ -564,18 +563,18 @@ namespace orc {
static_assert(static_cast<int>(CacheLevel::L1) == 0, "");
const int i = static_cast<int>(level);
- if (impl_->cache_sizes[i] > 0) return impl_->cache_sizes[i];
+ if (impl_->cacheSizes[i] > 0) return impl_->cacheSizes[i];
if (i == 0) return kDefaultCacheSizes[0];
// l3 may be not available, return maximum of l2 or default size
- return std::max(kDefaultCacheSizes[i], impl_->cache_sizes[i - 1]);
+ return std::max(kDefaultCacheSizes[i], impl_->cacheSizes[i - 1]);
}
bool CpuInfo::isSupported(int64_t flags) const {
- return (impl_->hardware_flags & flags) == flags;
+ return (impl_->hardwareFlags & flags) == flags;
}
bool CpuInfo::isDetected(int64_t flags) const {
- return (impl_->original_hardware_flags & flags) == flags;
+ return (impl_->originalHardwareFlags & flags) == flags;
}
void CpuInfo::verifyCpuRequirements() const {
diff --git a/c++/src/Exceptions.cc b/c++/src/Exceptions.cc
index 30ecf7dc7c..2ba1ab404c 100644
--- a/c++/src/Exceptions.cc
+++ b/c++/src/Exceptions.cc
@@ -84,4 +84,20 @@ namespace orc {
SchemaEvolutionError::~SchemaEvolutionError() noexcept {
// PASS
}
+
+ CompressionError::CompressionError(const std::string& whatArg) : runtime_error(whatArg) {
+ // PASS
+ }
+
+ CompressionError::CompressionError(const char* whatArg) : runtime_error(whatArg) {
+ // PASS
+ }
+
+ CompressionError::CompressionError(const CompressionError& error) : runtime_error(error) {
+ // PASS
+ }
+
+ CompressionError::~CompressionError() noexcept {
+ // PASS
+ }
} // namespace orc
diff --git a/c++/src/Geospatial.cc b/c++/src/Geospatial.cc
new file mode 100644
index 0000000000..6d7d268703
--- /dev/null
+++ b/c++/src/Geospatial.cc
@@ -0,0 +1,307 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This file contains code adapted from the Apache Arrow project.
+ *
+ * Original source:
+ * https://github.com/apache/arrow/blob/main/cpp/src/parquet/geospatial/statistics.cc
+ *
+ * The original code is licensed under the Apache License, Version 2.0.
+ *
+ * Modifications may have been made from the original source.
+ */
+
+#include "orc/Geospatial.hh"
+#include "orc/Exceptions.hh"
+
+#include "Geospatial.hh"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>