Learning Oracle Application and Software Testing

Tuesday, August 30, 2022

How to Extract Text from PDFs with Python Pypdf2?

import encodings
from PyPDF2 import PdfFileReader
from pathlib import Path
import glob
import json
import re
import pymysql

for pdfFile in Path("pdfs").glob("*.pdf"):

# Create pdf file reader object
pdf = PdfFileReader(pdfFile)

# Grab the page(s)
page_1_object = pdf.getPage(0)

# Extract text
page_1_text = page_1_object.extractText()

# Combine the text from all the pages and save as txt file
with open("txts/{}.txt".format(pdfFile.stem), mode='w', encoding="utf-8") as file:
for page in pdf.pages:
text = ''
text += page.extractText()
file.write(text)
file.close

👋 Hi, I'm Suriya — QA Engineer with 4+ years of experience in manual, API & automation testing.

📬 Contact Me | LinkedIn | GitHub

📌 Follow for: Real-Time Test Cases, Bug Reports, Selenium Frameworks.

Monday, August 22, 2022

Retrieve Image as a BLOB from MySQL Table using Python

# Import the required modules
import mysql.connector
import base64
from PIL import Image
import io

# For security reasons, never expose your password
#password = open('password','r').readline()

# Create a connection
mydb = mysql.connector.connect(
host="host",
user="suriyaparithy",
password="suriyaparithy",
database="database" # Name of the database
)

# Create a cursor object
cursor = mydb.cursor()

# Prepare the query
query = 'SELECT PICTURE FROM PROFILE WHERE ID=100'

# Execute the query to get the file
cursor.execute(query)
data = cursor.fetchall()

# The returned data will be a list of list
image = data[0][0]

# Decode the string
binary_data = base64.b64decode(image)

# Convert the bytes into a PIL image
image = Image.open(io.BytesIO(binary_data))

# Display the image
image.show()

👋 Hi, I'm Suriya — QA Engineer with 4+ years of experience in manual, API & automation testing.

📬 Contact Me | LinkedIn | GitHub

📌 Follow for: Real-Time Test Cases, Bug Reports, Selenium Frameworks.

Image File stored as a BLOB in MySQL Table using Python

# Import the required modules
import mysql.connector
import base64
from PIL import Image
import io

# Create a connection
mydb = mysql.connector.connect(
host="localhost",
user="suriyaparithy",
password="suriyaparithy",
database="database" # Name of the database
)

# Create a cursor object
cursor = mydb.cursor()

# Open a file in binary mode
file = open('chemical.PNG','rb').read()

# We must encode the file to get base64 string
file = base64.b64encode(file)

# Sample data to be inserted
args = ('100', 'Sample Name', file)

# Prepare a query
query = 'INSERT INTO PROFILE VALUES(%s, %s, %s)'

# Execute the query and commit the database.
cursor.execute(query,args)
mydb.commit()

👋 Hi, I'm Suriya — QA Engineer with 4+ years of experience in manual, API & automation testing.

📬 Contact Me | LinkedIn | GitHub

📌 Follow for: Real-Time Test Cases, Bug Reports, Selenium Frameworks.

Extract text from a single image using Python

#Extract text from a single image using Python
from PIL import Image
from pytesseract import pytesseract

#Define path to tessaract.exe
path_to_tesseract = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

#Define path to image
path_to_image = 'chemical.PNG'

#Point tessaract_cmd to tessaract.exe
pytesseract.tesseract_cmd = path_to_tesseract

#Open image with PIL
img = Image.open(path_to_image)

#Extract text from image
text = pytesseract.image_to_string(img)
print(text)

👋 Hi, I'm Suriya — QA Engineer with 4+ years of experience in manual, API & automation testing.

📬 Contact Me | LinkedIn | GitHub

📌 Follow for: Real-Time Test Cases, Bug Reports, Selenium Frameworks.

Extract only images from PDF using Python

# How to Extract Images from PDF in Python
import fitz # PyMuPDF
import io
from PIL import Image

# file path you want to extract images from
file = "byju.pdf"

# open the file
pdf_file = fitz.open(file)

# iterate over PDF pages
for page_index in range(len(pdf_file)):

# get the page itself
page = pdf_file[page_index]
image_list = page.get_images()

# printing number of images found in this page
if image_list:
print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
else:
print("[!] No images found on page", page_index)
for image_index, img in enumerate(page.get_images(), start=1):

# get the XREF of the image
xref = img[0]

# extract the image bytes
base_image = pdf_file.extract_image(xref)
image_bytes = base_image["image"]

# get the image extension
image_ext = base_image["ext"]

# load it to PIL
image = Image.open(io.BytesIO(image_bytes))

# save it to local disk
image.save(open(f"image{page_index+1}_{image_index}.{image_ext}", "wb"))

👋 Hi, I'm Suriya — QA Engineer with 4+ years of experience in manual, API & automation testing.

📬 Contact Me | LinkedIn | GitHub

📌 Follow for: Real-Time Test Cases, Bug Reports, Selenium Frameworks.

Pages

Tuesday, August 30, 2022

How to Extract Text from PDFs with Python Pypdf2?

Monday, August 22, 2022

Retrieve Image as a BLOB from MySQL Table using Python

Image File stored as a BLOB in MySQL Table using Python

Extract text from a single image using Python

Extract only images from PDF using Python

Popular Posts

Blog Archive