|
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "markdown", |
| 5 | + "id": "88d22adb", |
5 | 6 | "metadata": {}, |
6 | 7 | "source": [ |
7 | 8 | "## Amazon Textract Queries\n", |
|
17 | 18 | }, |
18 | 19 | { |
19 | 20 | "cell_type": "code", |
20 | | - "execution_count": 29, |
| 21 | + "execution_count": 1, |
| 22 | + "id": "b566653d", |
21 | 23 | "metadata": {}, |
22 | 24 | "outputs": [ |
23 | 25 | { |
|
27 | 29 | "<IPython.core.display.Image object>" |
28 | 30 | ] |
29 | 31 | }, |
30 | | - "execution_count": 29, |
| 32 | + "execution_count": 1, |
31 | 33 | "metadata": {}, |
32 | 34 | "output_type": "execute_result" |
33 | 35 | } |
|
40 | 42 | }, |
41 | 43 | { |
42 | 44 | "cell_type": "markdown", |
| 45 | + "id": "a07d30b2", |
43 | 46 | "metadata": {}, |
44 | 47 | "source": [ |
45 | 48 | "Install the base framework to call Amazon Textract" |
46 | 49 | ] |
47 | 50 | }, |
48 | 51 | { |
49 | 52 | "cell_type": "code", |
50 | | - "execution_count": 30, |
| 53 | + "execution_count": 2, |
| 54 | + "id": "792e2d7a", |
51 | 55 | "metadata": {}, |
52 | 56 | "outputs": [ |
53 | 57 | { |
54 | 58 | "name": "stdout", |
55 | 59 | "output_type": "stream", |
56 | 60 | "text": [ |
57 | | - "/opt/conda/lib/python3.7/site-packages/secretstorage/dhcrypto.py:16: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", |
58 | | - " from cryptography.utils import int_from_bytes\n", |
59 | | - "/opt/conda/lib/python3.7/site-packages/secretstorage/util.py:25: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", |
60 | | - " from cryptography.utils import int_from_bytes\n", |
61 | | - "Processing ./amazon_textract_response_parser-0.1.21-py2.py3-none-any.whl\n", |
62 | | - "Requirement already satisfied: boto3 in /opt/conda/lib/python3.7/site-packages (1.20.4)\n", |
63 | | - "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /opt/conda/lib/python3.7/site-packages (from boto3) (0.10.0)\n", |
64 | | - "Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /opt/conda/lib/python3.7/site-packages (from boto3) (0.5.0)\n", |
65 | | - "Requirement already satisfied: botocore<1.24.0,>=1.23.4 in /opt/conda/lib/python3.7/site-packages (from boto3) (1.23.4)\n", |
66 | | - "Requirement already satisfied: marshmallow==3.11.1 in /opt/conda/lib/python3.7/site-packages (from amazon-textract-response-parser==0.1.21) (3.11.1)\n", |
67 | | - "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /opt/conda/lib/python3.7/site-packages (from botocore<1.24.0,>=1.23.4->boto3) (1.26.7)\n", |
68 | | - "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /opt/conda/lib/python3.7/site-packages (from botocore<1.24.0,>=1.23.4->boto3) (2.8.1)\n", |
69 | | - "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.24.0,>=1.23.4->boto3) (1.14.0)\n", |
70 | | - "amazon-textract-response-parser is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.\n", |
71 | | - "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n", |
72 | | - "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.3 is available.\n", |
73 | | - "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" |
| 61 | + "\u001b[33mWARNING: You are using pip version 21.1.3; however, version 22.0.4 is available.\r\n", |
| 62 | + "You should consider upgrading via the '/Users/schadem/.pyenv/versions/amazon-textract-code-samples/bin/python -m pip install --upgrade pip' command.\u001b[0m\r\n" |
74 | 63 | ] |
75 | 64 | } |
76 | 65 | ], |
77 | 66 | "source": [ |
78 | | - "!python -m pip install boto3 amazon_textract_response_parser-0.1.21-py2.py3-none-any.whl" |
| 67 | + "!python -m pip install -q boto3 amazon_textract_response_parser --upgrade" |
79 | 68 | ] |
80 | 69 | }, |
81 | 70 | { |
82 | 71 | "cell_type": "code", |
83 | | - "execution_count": 31, |
| 72 | + "execution_count": 3, |
| 73 | + "id": "2a26ff9d", |
84 | 74 | "metadata": {}, |
85 | 75 | "outputs": [], |
86 | 76 | "source": [ |
|
95 | 85 | }, |
96 | 86 | { |
97 | 87 | "cell_type": "code", |
98 | | - "execution_count": 32, |
| 88 | + "execution_count": 4, |
| 89 | + "id": "5bbccab5", |
99 | 90 | "metadata": {}, |
100 | 91 | "outputs": [], |
101 | 92 | "source": [ |
|
157 | 148 | }, |
158 | 149 | { |
159 | 150 | "cell_type": "markdown", |
| 151 | + "id": "048dbb41", |
160 | 152 | "metadata": {}, |
161 | 153 | "source": [ |
162 | 154 | "The response includes all OCR WORDS and LINES, geometry information, and confidence scores.\n", |
|
168 | 160 | }, |
169 | 161 | { |
170 | 162 | "cell_type": "code", |
171 | | - "execution_count": 33, |
| 163 | + "execution_count": 7, |
| 164 | + "id": "bacbcd15", |
172 | 165 | "metadata": {}, |
173 | 166 | "outputs": [ |
174 | 167 | { |
175 | 168 | "name": "stdout", |
176 | 169 | "output_type": "stream", |
177 | 170 | "text": [ |
178 | | - "/opt/conda/lib/python3.7/site-packages/secretstorage/dhcrypto.py:16: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", |
179 | | - " from cryptography.utils import int_from_bytes\n", |
180 | | - "/opt/conda/lib/python3.7/site-packages/secretstorage/util.py:25: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", |
181 | | - " from cryptography.utils import int_from_bytes\n", |
182 | | - "Requirement already satisfied: tabulate in /opt/conda/lib/python3.7/site-packages (0.8.9)\n", |
183 | | - "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n", |
184 | | - "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.3 is available.\n", |
185 | | - "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" |
| 171 | + "\u001b[33mWARNING: You are using pip version 21.1.3; however, version 22.0.4 is available.\r\n", |
| 172 | + "You should consider upgrading via the '/Users/schadem/.pyenv/versions/amazon-textract-code-samples/bin/python -m pip install --upgrade pip' command.\u001b[0m\r\n" |
186 | 173 | ] |
187 | 174 | } |
188 | 175 | ], |
189 | 176 | "source": [ |
190 | 177 | "# We use tabulate to pretty print the output\n", |
191 | | - "!python -m pip install tabulate" |
| 178 | + "!python -m pip -q install tabulate" |
192 | 179 | ] |
193 | 180 | }, |
194 | 181 | { |
195 | 182 | "cell_type": "code", |
196 | | - "execution_count": 34, |
| 183 | + "execution_count": 6, |
| 184 | + "id": "29f11a07", |
197 | 185 | "metadata": {}, |
198 | 186 | "outputs": [ |
199 | 187 | { |
200 | 188 | "name": "stdout", |
201 | 189 | "output_type": "stream", |
202 | 190 | "text": [ |
203 | 191 | "|-------------------------------------|-----------------------------------|---------------|\n", |
204 | | - "| What is the insured name? | INSURANCE_CARD_NAME | Jacob Michael |\n", |
205 | | - "| What is the level of benefits? | INSURANCE_CARD_LEVEL_BENEFITS | SILVER |\n", |
206 | | - "| What is medical insurance provider? | INSURANCE_CARD_PROVIDER | Anthem |\n", |
207 | | - "| What is the OOP max? | INSURANCE_CARD_OOP_MAX | $6000/$12000 |\n", |
208 | | - "| What is the effective date? | INSURANCE_CARD_EFFECTIVE_DATE | 11/02/2021 |\n", |
209 | | - "| What is the office visit copay? | INSURANCE_CARD_OFFICE_VISIT_COPAY | $55/0% |\n", |
210 | 192 | "| What is the specialist visit copay? | INSURANCE_CARD_SPEC_VISIT_COPAY | $65/0% |\n", |
| 193 | + "| What is the effective date? | INSURANCE_CARD_EFFECTIVE_DATE | 11/02/2021 |\n", |
211 | 194 | "| What is the member id? | INSURANCE_CARD_MEMBER_ID | XZ 9147589652 |\n", |
212 | 195 | "| What is the plan type? | INSURANCE_CARD_PLAN_TYPE | Pathway X-EPO |\n", |
213 | | - "| What is the coinsurance amount? | INSURANCE_CARD_COINSURANCE | 30% |\n" |
| 196 | + "| What is the OOP max? | INSURANCE_CARD_OOP_MAX | $6000/$12000 |\n", |
| 197 | + "| What is the level of benefits? | INSURANCE_CARD_LEVEL_BENEFITS | SILVER |\n", |
| 198 | + "| What is the office visit copay? | INSURANCE_CARD_OFFICE_VISIT_COPAY | $55/0% |\n", |
| 199 | + "| What is the coinsurance amount? | INSURANCE_CARD_COINSURANCE | 30% |\n", |
| 200 | + "| What is the insured name? | INSURANCE_CARD_NAME | Jacob Michael |\n", |
| 201 | + "| What is medical insurance provider? | INSURANCE_CARD_PROVIDER | Anthem |\n" |
214 | 202 | ] |
215 | 203 | } |
216 | 204 | ], |
|
228 | 216 | }, |
229 | 217 | { |
230 | 218 | "cell_type": "markdown", |
| 219 | + "id": "08b94690", |
231 | 220 | "metadata": {}, |
232 | 221 | "source": [ |
233 | 222 | "## Conclusion" |
234 | 223 | ] |
235 | 224 | }, |
236 | 225 | { |
237 | 226 | "cell_type": "markdown", |
| 227 | + "id": "3b5d55e4", |
238 | 228 | "metadata": {}, |
239 | 229 | "source": [ |
240 | 230 | "Textract Query does not require any training and can be used on any document type; even complex documents with high variance in layout, such as paystubs or bank statements, yield high accuracy.\n", |
|
247 | 237 | "metadata": { |
248 | 238 | "instance_type": "ml.t3.medium", |
249 | 239 | "kernelspec": { |
250 | | - "display_name": "Python 3 (Data Science)", |
| 240 | + "display_name": "Python 3 (ipykernel)", |
251 | 241 | "language": "python", |
252 | | - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0" |
| 242 | + "name": "python3" |
253 | 243 | }, |
254 | 244 | "language_info": { |
255 | 245 | "codemirror_mode": { |
|
261 | 251 | "name": "python", |
262 | 252 | "nbconvert_exporter": "python", |
263 | 253 | "pygments_lexer": "ipython3", |
264 | | - "version": "3.7.10" |
| 254 | + "version": "3.9.6" |
265 | 255 | } |
266 | 256 | }, |
267 | 257 | "nbformat": 4, |
|
0 commit comments