您的位置 首页 java

java文字识别技术(亲测,识别率很高)

java 文字识别程序的关键是寻找一个可以调用的OCR引擎。tesseract-ocr就是一个这样的OCR引擎,在1985年到1995年由HP实验室开发,现在由Google维护。tesseract-ocr 3.0已经发布,支持中文。不过tesseract-ocr 3.0不是图形化界面的客户端,别人写的FreeOCR图形化客户端还不支持导入新的3.0 traineddata。但这标志着,现在有自由的中文OCR软件了。

java中使用tesseract-ocr3.01的步骤如下:

1.下载安装tesseract-ocr-setup-3.01-1.exe(3.0以上版本才增加了中文识别)

2.在安装向导中可以选择需要下载的语言包。

3.到网上搜索下载java图形处理所需的2个包:jai_imageio-1.1-alpha.jar,swingx-1.6.1.jar

4.java程序清单:

ImageIOHelper 类:

  1. import java.awt.image.BufferedImage;
  2. import java.io.File;
  3. import java.io.IOException;
  4. import java.util.Iterator;
  5. import java.util.Locale;
  6. import javax.imageio.IIOImage;
  7. import javax.imageio.ImageIO;
  8. import javax.imageio.ImageReader;
  9. import javax.imageio.ImageWriteParam;
  10. import javax.imageio.ImageWriter;
  11. import javax.imageio.metadata.IIOMetadata;
  12. import javax.imageio.stream.ImageInputStream;
  13. import javax.imageio.stream.ImageOutputStream;
  14. import com.sun.media.imageio.plugins.tiff.TIFFImageWriteParam;
  /**
   * Converts an image file into an uncompressed TIFF copy stored next to the original.
   *
   * <p>The TIFF writer is looked up through {@link ImageIO}, so a TIFF plugin must be installed
   * (for example the jai_imageio jar mentioned in the article, or the plugin bundled with
   * JDK 9 and later).
   */
  public class ImageIOHelper {

    /**
     * Reads the first image of {@code imageFile} and writes it as an uncompressed TIFF.
     *
     * @param imageFile   the source image
     * @param imageFormat informal format name of the source, e.g. {@code "png"}
     * @return the TIFF file that was written; its name is the source base name followed by
     *         {@code 0.tif} (for example {@code 4.png} becomes {@code 40.tif})
     * @throws IllegalArgumentException if no reader is registered for {@code imageFormat}
     * @throws IllegalStateException    if no TIFF writer is installed or the conversion fails
     *                                  with an I/O error (the cause is preserved)
     */
    public static File createImage(File imageFile, String imageFormat) {
      Iterator<ImageReader> readers = ImageIO.getImageReadersByFormatName(imageFormat);
      if (!readers.hasNext()) {
        throw new IllegalArgumentException("No image reader found for format: " + imageFormat);
      }
      Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("tiff");
      if (!writers.hasNext()) {
        throw new IllegalStateException("No TIFF image writer is installed");
      }

      ImageReader reader = readers.next();
      ImageWriter writer = writers.next();
      File tempFile = tempImageFile(imageFile);
      try {
        convert(reader, writer, imageFile, tempFile);
      } catch (IOException e) {
        // Fail loudly instead of handing the caller a null or half-written file.
        throw new IllegalStateException("Cannot convert " + imageFile + " to TIFF", e);
      } finally {
        writer.dispose();
        reader.dispose();
      }
      return tempFile;
    }

    /** Copies the first image of {@code source} into {@code target} as an uncompressed TIFF. */
    private static void convert(ImageReader reader, ImageWriter writer, File source, File target)
        throws IOException {
      ImageInputStream iis = ImageIO.createImageInputStream(source);
      if (iis == null) {
        throw new IOException("Cannot open image file: " + source);
      }
      try {
        reader.setInput(iis);
        // Read the stream metadata
        IIOMetadata streamMetadata = reader.getStreamMetadata();
        // Ask the writer for its own parameter object rather than instantiating the JAI-specific
        // TIFFImageWriteParam, so the code works with whichever TIFF plugin is installed.
        ImageWriteParam tiffWriteParam = writer.getDefaultWriteParam();
        tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED);

        BufferedImage bi = reader.read(0);
        IIOImage image = new IIOImage(bi, null, reader.getImageMetadata(0));

        // File-backed image output streams do not truncate, so remove any stale output first.
        if (target.exists() && !target.delete()) {
          throw new IOException("Cannot replace existing file: " + target);
        }
        ImageOutputStream ios = ImageIO.createImageOutputStream(target);
        if (ios == null) {
          throw new IOException("Cannot create output file: " + target);
        }
        try {
          writer.setOutput(ios);
          writer.write(streamMetadata, image, tiffWriteParam);
        } finally {
          ios.close();
        }
      } finally {
        iis.close();
      }
    }

    /**
     * Derives the TIFF file name: the extension (if any) is dropped and {@code 0.tif} is appended
     * to the base name. Only the file name is inspected, so dots in directory names and files
     * without an extension are handled.
     */
    private static File tempImageFile(File imageFile) {
      String name = imageFile.getName();
      int dot = name.lastIndexOf('.');
      String base = (dot > 0) ? name.substring(0, dot) : name;
      return new File(imageFile.getParentFile(), base + "0.tif");
    }
  }

OCR 类:

  1. package com.hhp.util;
  2. import java.io.BufferedReader;
  3. import java.io.File;
  4. import java.io.FileInputStream;
  5. import java.io.InputStreamReader;
  6. import java.util.ArrayList;
  7. import java.util.List;
  8. import org.jdesktop.swingx.util.OS;
  9. public class OCR {
  10. private final String LANG_OPTION = “-l”; //英文字母小写l,并非数字1
  11. private final String EOL = System.getProperty(“line.separator”);
  12. private String tessPath = “C://Program Files (x86)//Tesseract-OCR”;
  13. //private String tessPath = new File(“tesseract”).getAbsolutePath();
  14. public String recognizeText(File imageFile,String imageFormat)throws Exception{
  15. File tempImage = ImageIOHelper.createImage(imageFile,imageFormat);
  16. File outputFile = new File(imageFile.getParentFile(),”output”);
  17. StringBuffer strB = new StringBuffer();
  18. List cmd = new ArrayList();
  19. if(OS.isWindowsXP()){
  20. cmd.add(tessPath+”//tesseract”);
  21. }else if(OS.isLinux()){
  22. cmd.add(“tesseract”);
  23. }else{
  24. cmd.add(tessPath+”//tesseract”);
  25. }
  26. cmd.add(“”);
  27. cmd.add(outputFile.getName());
  28. cmd.add(LANG_OPTION);
  29. cmd.add(“chi_sim”);
  30. //cmd.add(“eng”);
  31. ProcessBuilder pb = new ProcessBuilder();
  32. pb.directory(imageFile.getParentFile());
  33. cmd.set(1, tempImage.getName());
  34. pb.command(cmd);
  35. pb.redirectErrorStream(true);
  36. Process process = pb.start();
  37. //tesseract.exe 1.jpg 1 -l chi_sim
  38. int w = process.waitFor();
  39. //删除临时正在工作文件
  40. tempImage.delete();
  41. if(w==0){
  42. BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(outputFile.getAbsolutePath()+”.txt”),”UTF-8″));
  43. String str;
  44. while((str = in.readLine())!=null){
  45. strB. append (str).append(EOL);
  46. }
  47. in.close();
  48. }else{
  49. String msg ;
  50. switch(w){
  51. case 1:
  52. msg = “Errors accessing files.There may be spaces in your image’s filename.”;
  53. break ;
  54. case 29:
  55. msg = “Cannot recongnize the image or its selected region.”;
  56. break;
  57. case 31:
  58. msg = “Unsupported image format.”;
  59. break;
  60. default:
  61. msg = “Errors occurred.”;
  62. }
  63. tempImage.delete();
  64. throw new RuntimeException(msg);
  65. }
  66. new File(outputFile.getAbsolutePath()+”.txt”).delete();
  67. return strB.toString();
  68. }
  69. }

测试类 OcrTest:

  1. import java.io.File;
  2. import java.io.IOException;
  3. import com.hhp.util.OCR;
  4. public class OcrTest {
  5. public static void main(String[] args) {
  6. String path = “C://temp//OCRcode//4.png”;
  7. System.out.println(“ORC Test Begin……”);
  8. try {
  9. String valCode = new OCR().recognizeText(new File(path), “png”);
  10. System.out.println(valCode);
  11. } catch (IOException e) {
  12. e.printStackTrace();
  13. } catch (Exception e) {
  14. e.printStackTrace();
  15. }
  16. System.out.println(“ORC Test End……”);
  17. }
  18. }

经过测试,tesseract-ocr 3.01的文字识别率很高,对于网站中常见的验证码识别率也很高。

文章来源:智云一二三科技

文章标题:java文字识别技术(亲测,识别率很高)

文章地址:https://www.zhihuclub.com/181273.shtml

关于作者: 智云科技

热门文章

网站地图