Monday, August 22, 2022

Extract text from a single image using Python

#Extract text from a single image using Python
from PIL import Image
from pytesseract import pytesseract

# Define path to tesseract.exe
path_to_tesseract = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Define path to image
path_to_image = 'chemical.PNG'

# Point pytesseract's tesseract_cmd at the Tesseract executable
pytesseract.tesseract_cmd = path_to_tesseract

# Open image with PIL; the context manager releases the underlying file
# handle as soon as OCR has finished instead of leaving it open.
with Image.open(path_to_image) as img:
    # Extract text from image
    text = pytesseract.image_to_string(img)
print(text)

Extract only images from PDF using Python

 # How to Extract Images from PDF in Python
import fitz # PyMuPDF
import io
from PIL import Image

# file path you want to extract images from
file = "byju.pdf"

# open the file; the context manager closes the document when extraction
# finishes or fails part-way through
with fitz.open(file) as pdf_file:

    # iterate over PDF pages (page_index is 0-based)
    for page_index, page in enumerate(pdf_file):

        # fetch the image list once and reuse it for both the report and the loop
        image_list = page.get_images()

        # printing number of images found in this page
        if image_list:
            print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
        else:
            print("[!] No images found on page", page_index)

        for image_index, img in enumerate(image_list, start=1):

            # get the XREF of the image (first item of each image entry)
            xref = img[0]

            # extract the image bytes
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]

            # get the image extension
            image_ext = base_image["ext"]

            # load it to PIL and save it to local disk; passing the path
            # (not an open file object) lets PIL open and close the output
            # file itself, so no handle is leaked. File names use a 1-based
            # page number.
            with Image.open(io.BytesIO(image_bytes)) as image:
                image.save(f"image{page_index+1}_{image_index}.{image_ext}")

Extract text and images from PDF files in Python

# Module Imports
import os
from PIL import Image
import pytesseract
from pdf2image import convert_from_path

# Define Paths
poppler_path = r'C:\Program Files\poppler-0.68.0\poppler-0.68.0\bin'
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'
pdf_path = "chemistry.pdf"

# Save PDF pages to images, remembering exactly which files were written so
# that only those pages are OCR'd, in page order. Scanning the working
# directory instead would also pick up unrelated .png files and would not
# guarantee any ordering.
images = convert_from_path(pdf_path=pdf_path, poppler_path=poppler_path)
png_files = []
for count, img in enumerate(images):
    img_name = f"page_{count}.png"
    img.save(img_name, "png")
    png_files.append(img_name)

# Extract Text
for png_file in png_files:
    # close each page image once its text has been read
    with Image.open(png_file) as page_image:
        extracted_text = pytesseract.image_to_string(page_image)
    print(extracted_text)

Reference - https://www.gcptutorials.com/post/python-extract-text-from-pdf-files 

Tuesday, August 2, 2022

Read an Excel file in Selenium WebDriver using jxl

 package coaching;

import java.io.FileInputStream;

import java.io.IOException;

import java.time.Duration;

import org.openqa.selenium.By;

import org.openqa.selenium.WebDriver;

import org.openqa.selenium.WebElement;

import org.openqa.selenium.chrome.ChromeDriver;

import org.testng.annotations.DataProvider;

import org.testng.annotations.Test;

import jxl.Sheet;

import jxl.Workbook;

import jxl.read.biff.BiffException;


public class JxlData {

WebDriver driver;

String [][] data = null;

@DataProvider(name="loginData")

public String [][] loginDataProvider() throws BiffException, IOException{

data=getExcelData();

// string change to object

// jxl jar used only xls format(97-2003 worksheet)

return data;

}

  public String[][] getExcelData() throws BiffException, IOException{

  

  FileInputStream excel = new FileInputStream("F:\\Suriya\\suri.xls");

   Workbook workbook = Workbook.getWorkbook(excel);

  Sheet sheet = workbook.getSheet(0); // sheet name

  int rowCount = sheet.getRows();

  int columnCount = sheet.getColumns();

  String testData[][] = new String[rowCount-1][columnCount];

  

  for (int i=1; i<rowCount;i++) {

  for (int j=0;j<columnCount;j++) {

  testData[i-1][j]=sheet.getCell(j, i).getContents();

  }

  }

   return testData;

  }

  @Test(dataProvider="loginData")

  public void login(String uName, String pword) {

  System.setProperty("webdriver.chrome.driver", "F:\\Suriya\\chromedriver.exe");

driver = new ChromeDriver();

driver.manage().window().maximize();

driver.manage().deleteAllCookies();

driver.manage().timeouts().pageLoadTimeout(Duration.ofSeconds(30));

driver.manage().timeouts().implicitlyWait(Duration.ofSeconds(30));

driver.get("URL");

WebElement username = driver.findElement(By.id("userName"));

username.sendKeys(uName);


WebElement password = driver.findElement(By.id("pwd"));

password.sendKeys(pword);


WebElement login = driver.findElement(By.cssSelector(".btn:nth-child(3)"));

login.click();

    }

}


Selenium using Data Provider Method

 package coaching;


import java.time.Duration;

import org.openqa.selenium.By;

import org.openqa.selenium.WebDriver;

import org.openqa.selenium.WebElement;

import org.openqa.selenium.chrome.ChromeDriver;

import org.testng.annotations.DataProvider;

import org.testng.annotations.Test;


public class DataProviderTest {

WebDriver driver;

@DataProvider(name = "Authentication")

 public static Object[][] credentials() {

           // The number of times data is repeated, test will be executed the same no. of times

          // Here it will execute two times

           return new Object[][] { { "suriya", "Test@123" }, { "parithy", "Test@123" }};

     }

@Test(dataProvider = "Authentication")


  public void test(String username, String password) {

System.setProperty("webdriver.chrome.driver", "F:\\Suriya\\chromedriver.exe");

driver = new ChromeDriver();

driver.manage().window().maximize();

driver.manage().deleteAllCookies();

driver.manage().timeouts().pageLoadTimeout(Duration.ofSeconds(30));

driver.manage().timeouts().implicitlyWait(Duration.ofSeconds(30));

driver.get("URL");

WebElement username1 = driver.findElement(By.id("userName"));

username1.sendKeys(username);

                       WebElement password1 = driver.findElement(By.id("pwd"));

password1.sendKeys(password);

                      WebElement login = driver.findElement(By.cssSelector(".btn:nth-child(3)"));

login.click();

   }

}