Tuesday, August 30, 2022

How to Extract Text from PDFs with Python Pypdf2?

import encodings
from PyPDF2 import PdfFileReader
from pathlib import Path
import glob
import json
import re
import pymysql

for pdfFile in Path("pdfs").glob("*.pdf"):

# Create pdf file reader object
  pdf = PdfFileReader(pdfFile)

# Grab the page(s)
  page_1_object = pdf.getPage(0)

# Extract text
  page_1_text = page_1_object.extractText()

# Combine the text from all the pages and save as txt file
with open("txts/{}.txt".format(pdfFile.stem), mode='w', encoding="utf-8") as file:
        for page in pdf.pages:
            text = ''
            text += page.extractText()
            file.write(text)
            file.close

👋 Hi, I'm Suriya — QA Engineer with 4+ years of experience in manual, API & automation testing.

📬 Contact Me | LinkedIn | GitHub

📌 Follow for: Real-Time Test Cases, Bug Reports, Selenium Frameworks.

Monday, August 22, 2022

Retrieve Image as a BLOB from MySQL Table using Python

 # Import the required modules
import mysql.connector
import base64
from PIL import Image
import io

# For security reasons, never expose your password
#password = open('password','r').readline()

# Create a connection
mydb = mysql.connector.connect(
host="host",
user="suriyaparithy",
password="suriyaparithy",
database="database" # Name of the database
)

# Create a cursor object
cursor = mydb.cursor()

# Prepare the query
query = 'SELECT PICTURE FROM PROFILE WHERE ID=100'

# Execute the query to get the file
cursor.execute(query)
data = cursor.fetchall()

# The returned data will be a list of list
image = data[0][0]

# Decode the string
binary_data = base64.b64decode(image)

# Convert the bytes into a PIL image
image = Image.open(io.BytesIO(binary_data))

# Display the image
image.show()

👋 Hi, I'm Suriya — QA Engineer with 4+ years of experience in manual, API & automation testing.

📬 Contact Me | LinkedIn | GitHub

📌 Follow for: Real-Time Test Cases, Bug Reports, Selenium Frameworks.

Image File stored as a BLOB in MySQL Table using Python

 # Import the required modules
import mysql.connector
import base64
from PIL import Image
import io

# Create a connection
mydb = mysql.connector.connect(
host="localhost",
user="suriyaparithy",
password="suriyaparithy",
database="database" # Name of the database
)

# Create a cursor object
cursor = mydb.cursor()

# Open a file in binary mode
file = open('chemical.PNG','rb').read()

# We must encode the file to get base64 string
file = base64.b64encode(file)

# Sample data to be inserted
args = ('100', 'Sample Name', file)

# Prepare a query
query = 'INSERT INTO PROFILE VALUES(%s, %s, %s)'

# Execute the query and commit the database.
cursor.execute(query,args)
mydb.commit()

👋 Hi, I'm Suriya — QA Engineer with 4+ years of experience in manual, API & automation testing.

📬 Contact Me | LinkedIn | GitHub

📌 Follow for: Real-Time Test Cases, Bug Reports, Selenium Frameworks.

Extract text from a single image using Python

#Extract text from a single image using Python
from PIL import Image
from pytesseract import pytesseract

#Define path to tessaract.exe
path_to_tesseract = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

#Define path to image
path_to_image = 'chemical.PNG'

#Point tessaract_cmd to tessaract.exe
pytesseract.tesseract_cmd = path_to_tesseract

#Open image with PIL
img = Image.open(path_to_image)

#Extract text from image
text = pytesseract.image_to_string(img)
print(text)

👋 Hi, I'm Suriya — QA Engineer with 4+ years of experience in manual, API & automation testing.

📬 Contact Me | LinkedIn | GitHub

📌 Follow for: Real-Time Test Cases, Bug Reports, Selenium Frameworks.

Extract only images from PDF using Python

 # How to Extract Images from PDF in Python
import fitz # PyMuPDF
import io
from PIL import Image

# file path you want to extract images from
file = "byju.pdf"

# open the file
pdf_file = fitz.open(file)

# iterate over PDF pages
for page_index in range(len(pdf_file)):

    # get the page itself
    page = pdf_file[page_index]
    image_list = page.get_images()

    # printing number of images found in this page
    if image_list:
        print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
    else:
        print("[!] No images found on page", page_index)
    for image_index, img in enumerate(page.get_images(), start=1):

        # get the XREF of the image
        xref = img[0]

        # extract the image bytes
        base_image = pdf_file.extract_image(xref)
        image_bytes = base_image["image"]

        # get the image extension
        image_ext = base_image["ext"]

        # load it to PIL
        image = Image.open(io.BytesIO(image_bytes))

        # save it to local disk
        image.save(open(f"image{page_index+1}_{image_index}.{image_ext}", "wb"))

👋 Hi, I'm Suriya — QA Engineer with 4+ years of experience in manual, API & automation testing.

📬 Contact Me | LinkedIn | GitHub

📌 Follow for: Real-Time Test Cases, Bug Reports, Selenium Frameworks.