1、前言

想要提取图片中的文字,需要使用图像识别技术 OCR

什么是OCR?

OCR (Optical Character Recognition,光学字符识别)是指电子设备(例如扫描仪或数码相机)检查纸上打印的字符,通过检测暗、亮的模式确定其形状,然后用字符识别方法将形状翻译成计算机文字的过程

方案与说明:
  • 百度OCR:收费
  • Tesseract-OCR:Google维护的开源OCR引擎,支持Java、Python等语言调用
  • Tess4J:封装了Tesseract-OCR,支持Java调用

2、开源Tess4j

  • 引入依赖
<!-- Tess4J: a wrapper around Tesseract-OCR that can be called from Java -->
<dependency>
    <groupId>net.sourceforge.tess4j</groupId>
    <artifactId>tess4j</artifactId>
    <version>4.1.1</version>
</dependency>
  • 配置词库
# Tess4j trained-data (language data) configuration
tess4j:
  # Presumably the directory holding the Tesseract language data files; this is a
  # machine-specific Windows path, so adjust it per environment.
  data-path: D:\java\tessdata
  # Language data set to use; chi_sim is presumably Simplified Chinese — confirm the
  # matching data file exists under data-path.
  language: chi_sim

3、测试使用

import com.heima.audit.tess4j.Tess4jClient;
import com.heima.file.service.FileStorageService;
import com.heima.wemedia.WemediaApplication;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringRunner;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;

/**
 * Manual integration tests for uploading an image through {@link FileStorageService}
 * and extracting its text with {@link Tess4jClient}.
 *
 * <p>Both tests run inside the Spring context of {@link WemediaApplication} and depend on
 * hard-coded external resources (a local file and a storage URL), so they only pass in an
 * environment where those resources exist.
 */
@SpringBootTest(classes = WemediaApplication.class)
@RunWith(SpringRunner.class)
public class Tess4jClientTest {

    @Autowired
    Tess4jClient tess4jClient;

    @Autowired
    FileStorageService fileStorageService;

    /**
     * Uploads the local image {@code D:\143.png} under the name {@code test.png}
     * and prints the value returned by the storage service.
     *
     * @throws Exception if the local file cannot be opened or the upload fails
     */
    @Test
    public void upload() throws Exception {
        // try-with-resources releases the file handle even when the upload throws.
        try (FileInputStream imageStream = new FileInputStream("D:\\143.png")) {
            String uploadImgFile = fileStorageService.uploadImgFile("", "test.png", imageStream);
            System.out.println(uploadImgFile);
        }
    }

    /**
     * Downloads a previously uploaded image, decodes it, runs OCR on it and prints
     * the recognized text.
     *
     * @throws IllegalStateException if the download yields no data or the bytes
     *                               cannot be decoded as an image
     * @throws Exception             if the download or the OCR call fails
     */
    @Test
    public void testScanText() throws Exception {
        String imageUrl = "http://192.168.200.130:9000/leadnews/2021/11/19/test.png";

        byte[] bytes = fileStorageService.downLoadFile(imageUrl);
        if (bytes == null || bytes.length == 0) {
            throw new IllegalStateException("No image data downloaded from " + imageUrl);
        }

        // Convert the downloaded byte[] into a BufferedImage.
        // ImageIO.read returns null (rather than throwing) when no registered reader
        // understands the data, so check before handing the image to the OCR client.
        BufferedImage imageFile = ImageIO.read(new ByteArrayInputStream(bytes));
        if (imageFile == null) {
            throw new IllegalStateException("Downloaded data is not a decodable image: " + imageUrl);
        }

        // Recognize the text contained in the image.
        String result = tess4jClient.doOCR(imageFile);
        System.out.println("图片文字识别结果 "+result);
    }
}