# Example: using AWS Textract to detect text in a PDF stored in S3
import boto3
import time
def startJob(s3BucketName, objectName):
    """Start an asynchronous Textract text-detection job for an S3 object.

    Args:
        s3BucketName: Name of the S3 bucket passed as the document's 'Bucket'.
        objectName: Object name passed as the document's 'Name'.

    Returns:
        The "JobId" value from the start_document_text_detection response.
    """
    textract = boto3.client('textract')
    document_location = {
        'S3Object': {
            'Bucket': s3BucketName,
            'Name': objectName,
        },
    }
    started = textract.start_document_text_detection(
        DocumentLocation=document_location,
    )
    return started["JobId"]
def isJobComplete(jobId):
    """Block until the Textract job is no longer "IN_PROGRESS".

    Waits five seconds before the first status check and five seconds
    between subsequent checks, printing the status after every poll.

    Args:
        jobId: Job identifier passed to get_document_text_detection.

    Returns:
        The last "JobStatus" string read from the service. This is the
        status text itself, not a boolean, so callers should compare it
        against the status they expect rather than truth-testing it.
    """
    # For production use cases, use SNS based notification
    # Details at: https://docs.aws.amazon.com/textract/latest/dg/api-async.html
    time.sleep(5)
    textract = boto3.client('textract')
    while True:
        job_status = textract.get_document_text_detection(JobId=jobId)["JobStatus"]
        print("Job status: {}".format(job_status))
        if job_status != "IN_PROGRESS":
            return job_status
        # Still running: pause before asking again.
        time.sleep(5)
def getJobResults(jobId):
    """Collect every response page of a Textract text-detection job.

    Calls get_document_text_detection repeatedly, following the
    'NextToken' value of each response until a response carries no
    (or an empty) token. A progress line is printed per page.

    Args:
        jobId: Job identifier passed to get_document_text_detection.

    Returns:
        A list of the raw response objects, in the order received.
    """
    pages = []
    textract = boto3.client('textract')
    request = {"JobId": jobId}
    while True:
        page = textract.get_document_text_detection(**request)
        pages.append(page)
        print("Resultset page recieved: {}".format(len(pages)))
        token = page.get('NextToken')
        if not token:
            break
        # Ask for the page that follows the one just received.
        request["NextToken"] = token
    return pages
# Document to analyse: bucket and object name of the PDF in S3.
s3BucketName = "ki-textract-demo-docs"
documentName = "Amazon-Textract-Pdf.pdf"

jobId = startJob(s3BucketName, documentName)
print("Started job with id: {}".format(jobId))

# isJobComplete returns the final status string, which is truthy even for
# a failed job, so compare it explicitly instead of using it as a boolean.
jobStatus = isJobComplete(jobId)
if jobStatus in ("SUCCEEDED", "PARTIAL_SUCCESS"):
    response = getJobResults(jobId)

    # Print detected text, one LINE block per row, in blue (ANSI colour 94).
    for resultPage in response:
        for item in resultPage["Blocks"]:
            if item["BlockType"] == "LINE":
                print('\033[94m' + item["Text"] + '\033[0m')
else:
    # No usable result set; report the outcome rather than failing on a
    # missing "Blocks" key further down.
    print("Job {} ended with status {}; no text to print.".format(jobId, jobStatus))
Are there any code examples left?
New code examples in category Java
-
Java 2022-03-27 21:35:04 Sort string array in case insensitive order and case sensitive order java
-
Java 2022-03-27 21:25:10 java -jar -l resources\es.porperties -i ejemplo.txt -o inject.bin
-
Java 2022-03-27 21:20:21 Debug & Fix a 2-Dimensional Array Java Console Application
-
Java 2022-03-27 20:40:19 TreeSet headSet(E toElement) method in java
-
Java 2022-03-27 19:30:06 close keyboard android
-
Java 2022-03-27 19:20:40 java measure execution time
-
Java 2022-03-27 19:10:06 how to add cardview support in android studio
-
Java 2022-03-27 18:55:08 android studio lower case letters on a button