Tuesday, August 30, 2022

How to Extract Text from PDFs with Python and PyPDF2

import encodings
from PyPDF2 import PdfFileReader
from pathlib import Path
import glob
import json
import re
import pymysql

# Convert every PDF in ./pdfs into a UTF-8 text file of the same name in ./txts.

# Make sure the output directory exists; open() below would fail otherwise.
Path("txts").mkdir(exist_ok=True)

for pdfFile in Path("pdfs").glob("*.pdf"):

    # Create pdf file reader object
    pdf = PdfFileReader(pdfFile)

    # Combine the text from all the pages and save as txt file.
    # This must happen inside the loop so that each PDF gets its own output
    # file; the `with` statement closes the file when the block exits.
    with open("txts/{}.txt".format(pdfFile.stem), mode='w', encoding="utf-8") as file:
        for page in pdf.pages:
            file.write(page.extractText())

Monday, August 22, 2022

Retrieve Image as a BLOB from MySQL Table using Python

 # Import the required modules
import mysql.connector
import base64
from PIL import Image
import io

# For security reasons, never expose your password
#password = open('password','r').readline()

# Fetch the base64-encoded PICTURE of profile 100 and display it.

# Create a connection
mydb = mysql.connector.connect(
    host="host",
    user="suriyaparithy",
    password="suriyaparithy",
    database="database",  # Name of the database
)

try:
    # Create a cursor object
    cursor = mydb.cursor()
    try:
        # Prepare the query
        query = 'SELECT PICTURE FROM PROFILE WHERE ID=100'

        # Execute the query to get the file
        cursor.execute(query)
        data = cursor.fetchall()
    finally:
        cursor.close()
finally:
    # Release the connection even if the query fails.
    mydb.close()

# The returned data is a list of rows; stop with a clear message when the
# profile does not exist instead of failing on data[0].
if not data:
    raise SystemExit("No row found in PROFILE with ID=100")
encoded_image = data[0][0]

# Decode the base64 string back into the raw image bytes
binary_data = base64.b64decode(encoded_image)

# Convert the bytes into a PIL image
image = Image.open(io.BytesIO(binary_data))

# Display the image
image.show()

Image File stored as a BLOB in MySQL Table using Python

 # Import the required modules
import mysql.connector
import base64
from PIL import Image
import io

# Insert chemical.PNG, base64-encoded, as a new row of the PROFILE table.

# Create a connection
mydb = mysql.connector.connect(
    host="localhost",
    user="suriyaparithy",
    password="suriyaparithy",
    database="database",  # Name of the database
)

try:
    # Create a cursor object
    cursor = mydb.cursor()
    try:
        # Open the file in binary mode; `with` closes the handle after reading.
        with open('chemical.PNG', 'rb') as image_file:
            raw_bytes = image_file.read()

        # We must encode the file to get base64 string
        encoded_image = base64.b64encode(raw_bytes)

        # Sample data to be inserted
        args = ('100', 'Sample Name', encoded_image)

        # Prepare a query; the values are passed separately as parameters.
        query = 'INSERT INTO PROFILE VALUES(%s, %s, %s)'

        # Execute the query and commit the database.
        cursor.execute(query, args)
        mydb.commit()
    finally:
        cursor.close()
finally:
    # Release the connection even if the insert fails.
    mydb.close()

Extract text from a single image using Python

#Extract text from a single image using Python
from PIL import Image
from pytesseract import pytesseract

# Location of the Tesseract executable and of the image to read
TESSERACT_EXE = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
IMAGE_FILE = 'chemical.PNG'

# Tell pytesseract which Tesseract executable to run
pytesseract.tesseract_cmd = TESSERACT_EXE

# Open the image with PIL, run OCR on it and print the recognised text
print(pytesseract.image_to_string(Image.open(IMAGE_FILE)))

Extract only images from PDF using Python

 # How to Extract Images from PDF in Python
import fitz # PyMuPDF
import io
from PIL import Image

# Save every image embedded in the PDF as image<page>_<index>.<ext>.

# file path you want to extract images from
file = "byju.pdf"

# open the file
pdf_file = fitz.open(file)

try:
    # iterate over PDF pages
    for page_index in range(len(pdf_file)):

        # Page number as shown to the user and used in file names (1-based),
        # so that console messages and output files agree.
        page_number = page_index + 1

        # get the page itself and list its images once
        page = pdf_file[page_index]
        image_list = page.get_images()

        # printing number of images found in this page
        if image_list:
            print(f"[+] Found a total of {len(image_list)} images in page {page_number}")
        else:
            print("[!] No images found on page", page_number)

        for image_index, img in enumerate(image_list, start=1):

            # get the XREF of the image
            xref = img[0]

            # extract the image bytes
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]

            # get the image extension
            image_ext = base_image["ext"]

            # load it to PIL
            image = Image.open(io.BytesIO(image_bytes))

            # save it to local disk; `with` closes the output file afterwards
            with open(f"image{page_number}_{image_index}.{image_ext}", "wb") as out_file:
                image.save(out_file)
finally:
    # Release the document even if extraction fails part-way.
    pdf_file.close()

Extract text and image from PDF files in python

# Module Imports
import os
from PIL import Image
import pytesseract
from pdf2image import convert_from_path

# Render each page of the PDF to a PNG file, then OCR those pages in order.

# Define Paths
poppler_path = r'C:\Program Files\poppler-0.68.0\poppler-0.68.0\bin'
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'
pdf_path = "chemistry.pdf"

# Save PDF pages to images, remembering the files written so that only
# these pages are OCR'd, in page order. Unrelated .png files that happen to
# be in the working directory are left alone.
images = convert_from_path(pdf_path=pdf_path, poppler_path=poppler_path)
png_files = []
for count, img in enumerate(images):
    img_name = f"page_{count}.png"
    img.save(img_name, "png")
    png_files.append(img_name)

# Extract Text
for png_file in png_files:
    # `with` closes the image file once OCR has finished.
    with Image.open(png_file) as page_image:
        extracted_text = pytesseract.image_to_string(page_image)
    print(extracted_text)

Reference - https://www.gcptutorials.com/post/python-extract-text-from-pdf-files 

Tuesday, August 2, 2022

Read an Excel file in Selenium WebDriver using jxl

 package coaching;

import java.io.FileInputStream;

import java.io.IOException;

import java.time.Duration;

import org.openqa.selenium.By;

import org.openqa.selenium.WebDriver;

import org.openqa.selenium.WebElement;

import org.openqa.selenium.chrome.ChromeDriver;

import org.testng.annotations.DataProvider;

import org.testng.annotations.Test;

import jxl.Sheet;

import jxl.Workbook;

import jxl.read.biff.BiffException;


public class JxlData {

WebDriver driver;

String [][] data = null;

@DataProvider(name="loginData")

public String [][] loginDataProvider() throws BiffException, IOException{

data=getExcelData();

// string change to object

// jxl jar used only xls format(97-2003 worksheet)

return data;

}

  public String[][] getExcelData() throws BiffException, IOException{

  

  FileInputStream excel = new FileInputStream("F:\\Suriya\\suri.xls");

   Workbook workbook = Workbook.getWorkbook(excel);

  Sheet sheet = workbook.getSheet(0); // sheet name

  int rowCount = sheet.getRows();

  int columnCount = sheet.getColumns();

  String testData[][] = new String[rowCount-1][columnCount];

  

  for (int i=1; i<rowCount;i++) {

  for (int j=0;j<columnCount;j++) {

  testData[i-1][j]=sheet.getCell(j, i).getContents();

  }

  }

   return testData;

  }

  @Test(dataProvider="loginData")

  public void login(String uName, String pword) {

  System.setProperty("webdriver.chrome.driver", "F:\\Suriya\\chromedriver.exe");

driver = new ChromeDriver();

driver.manage().window().maximize();

driver.manage().deleteAllCookies();

driver.manage().timeouts().pageLoadTimeout(Duration.ofSeconds(30));

driver.manage().timeouts().implicitlyWait(Duration.ofSeconds(30));

driver.get("URL");

WebElement username = driver.findElement(By.id("userName"));

username.sendKeys(uName);


WebElement password = driver.findElement(By.id("pwd"));

password.sendKeys(pword);


WebElement login = driver.findElement(By.cssSelector(".btn:nth-child(3)"));

login.click();

    }

}


Selenium using Data Provider Method

 package coaching;


import java.time.Duration;

import org.openqa.selenium.By;

import org.openqa.selenium.WebDriver;

import org.openqa.selenium.WebElement;

import org.openqa.selenium.chrome.ChromeDriver;

import org.testng.annotations.DataProvider;

import org.testng.annotations.Test;


public class DataProviderTest {

WebDriver driver;

@DataProvider(name = "Authentication")

 public static Object[][] credentials() {

           // The number of times data is repeated, test will be executed the same no. of times

          // Here it will execute two times

           return new Object[][] { { "suriya", "Test@123" }, { "parithy", "Test@123" }};

     }

@Test(dataProvider = "Authentication")


  public void test(String username, String password) {

System.setProperty("webdriver.chrome.driver", "F:\\Suriya\\chromedriver.exe");

driver = new ChromeDriver();

driver.manage().window().maximize();

driver.manage().deleteAllCookies();

driver.manage().timeouts().pageLoadTimeout(Duration.ofSeconds(30));

driver.manage().timeouts().implicitlyWait(Duration.ofSeconds(30));

driver.get("URL");

WebElement username1 = driver.findElement(By.id("userName"));

username1.sendKeys(username);

                       WebElement password1 = driver.findElement(By.id("pwd"));

password1.sendKeys(password);

                      WebElement login = driver.findElement(By.cssSelector(".btn:nth-child(3)"));

login.click();

   }

}


Node.js: get data from a MySQL database

 var mysql = require('mysql');

// Connect to the "project" database and print the `standard` column of the
// third row of the users table.
var con = mysql.createConnection({
  host: "localhost",
  user: "root",
  password: "12345",
  // NOTE(review): MySQL's default port is 3306 - confirm the server really
  // listens on 3000.
  port: "3000",
  database: "project",
});

con.connect(function (err) {
  if (err) throw err;
  console.log("Connected!");
});

con.query('SELECT * FROM users', function (err, rows, fields) {
  if (err) throw err;

  // rows[2] is undefined when the table has fewer than three rows; report
  // that instead of failing with a TypeError on `.standard`.
  if (rows.length < 3) {
    console.error('Expected at least 3 rows in users, found ' + rows.length);
    return;
  }

  console.log('The solution is: ', rows[2].standard);
});

con.end();

Convert PDF to Word using Python

 # Importing the Converter() class
from pdf2docx import Converter

# Specifying the pdf & docx files
# Convert ieep.pdf to sample.docx and report whether the conversion worked.
pdf_file = 'ieep.pdf'
docx_file = 'sample.docx'
try:
    cv = Converter(pdf_file)
    try:
        cv.convert(docx_file)
    finally:
        # Close the converter even when convert() fails.
        cv.close()
except Exception:
    # Catch Exception rather than using a bare `except:` so that Ctrl+C
    # (KeyboardInterrupt) and SystemExit are not reported as a failed
    # conversion.
    print('Conversion Failed')
else:
    print('File Converted Successfully')