最近在做图纸识别的项目调研。对于文字识别来说,各种云服务商已经提供了很详尽的API和各种开发语言的SDK,开箱即用,非常方便。但是对于开发者来说,具体的识别过程还是个黑匣子,而且是收费的。这里调研了几个开源的文字识别项目,目前还在调研过程中。
云服务商API:百度云、腾讯云、阿里云、华为云都提供了印刷文字识别服务。阿里云的接口在项目中用到过,文档写得非常详尽,可参照之前写的《身份证信息识别》一文。
Tesseract-OCR:谷歌公司产品,开源。经过测试,读取计算机生成的图片中的文字,准确率不错。使用起来很简单:实例化一个Tesseract实例,为已经训练好的LSTM模型设置数据路径,然后调用doOCR方法,它接收一个文件参数并返回识别出的字符串。这里用Spring Boot写了一个demo,测试了一下识别效果。
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Dependency versions not listed below are inherited from the Spring Boot parent. -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.2.6.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>

    <groupId>com.hhzhu</groupId>
    <artifactId>demo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>demo</name>
    <description>Demo project for Spring Boot</description>

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <!-- Web MVC: serves the upload form and handles the multipart POST. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <!-- Thymeleaf: renders the "upload" and "result" views. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-thymeleaf</artifactId>
        </dependency>

        <!-- Tess4J provides the Tesseract class used by the controller. -->
        <!-- https://mvnrepository.com/artifact/net.sourceforge.tess4j/tess4j -->
        <dependency>
            <groupId>net.sourceforge.tess4j</groupId>
            <artifactId>tess4j</artifactId>
            <version>4.4.1</version>
        </dependency>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
            <exclusions>
                <exclusion>
                    <groupId>org.junit.vintage</groupId>
                    <artifactId>junit-vintage-engine</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
controller
package com.hhzhu.controller;

import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import org.springframework.http.HttpStatus;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.server.ResponseStatusException;
import org.springframework.web.servlet.mvc.support.RedirectAttributes;
import org.springframework.web.servlet.view.RedirectView;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * Holder class for the OCR demo controller.
 */
public class TestController {

    /**
     * Serves an upload form, stores the uploaded image, runs Tesseract OCR on it
     * and redirects to a page showing the extracted text.
     */
    @Controller
    public static class FileUploadController {

        /** Directory the uploaded image is written to (machine-specific demo path). */
        private static final String STATIC_DIR =
                "C://Users//Victor//Desktop//demo//src//main//resources//static//";

        /** Directory holding the trained Tesseract data (machine-specific demo path). */
        private static final String TESSDATA_DIR =
                "C://Users//Victor//Desktop//demo//DataScience//testdata//";

        /**
         * Shows the upload form.
         *
         * @return the view name {@code "upload"}
         */
        @RequestMapping("/")
        public String index() {
            return "upload";
        }

        /**
         * Saves the uploaded file, runs OCR on the saved copy and redirects to
         * {@code result} with the upload and the recognised text as flash attributes.
         *
         * @param file               the uploaded image (form field {@code file})
         * @param redirectAttributes receives the flash attributes {@code "file"} and {@code "text"}
         * @param model              unused; kept so the handler signature is unchanged
         * @return a redirect to {@code result}
         * @throws ResponseStatusException with status 400 if the upload has no usable filename
         *                                 or the filename contains path components
         * @throws IOException             if the upload cannot be read or written to disk
         * @throws TesseractException      if OCR fails
         */
        @RequestMapping(value = "/upload", method = RequestMethod.POST)
        public RedirectView singleFileUpload(@RequestParam("file") MultipartFile file,
                                             RedirectAttributes redirectAttributes,
                                             Model model) throws IOException, TesseractException {
            final String filename;
            try {
                filename = requireSafeFilename(file);
            } catch (IllegalArgumentException e) {
                // A bad filename is a client error, not a server failure.
                throw new ResponseStatusException(HttpStatus.BAD_REQUEST, e.getMessage(), e);
            }

            Path savedImage = Paths.get(STATIC_DIR).resolve(filename);
            Files.write(savedImage, file.getBytes());

            Tesseract tesseract = new Tesseract();
            tesseract.setDatapath(TESSDATA_DIR);
            // OCR the copy that was just saved instead of writing a second,
            // never-deleted copy into the working directory.
            String text = tesseract.doOCR(savedImage.toFile());

            redirectAttributes.addFlashAttribute("file", file);
            redirectAttributes.addFlashAttribute("text", text);
            return new RedirectView("result");
        }

        /**
         * Shows the OCR result page.
         *
         * @return the view name {@code "result"}
         */
        @RequestMapping("/result")
        public String result() {
            return "result";
        }

        /**
         * Writes the upload's bytes to a file named after its original filename.
         * The name is used as a relative path, so the file is created relative to
         * the process working directory. The caller is responsible for deleting it.
         *
         * @param file the uploaded file
         * @return the file the bytes were written to
         * @throws IllegalArgumentException if the upload has no usable filename or the
         *                                  filename contains path components
         * @throws IOException              if the bytes cannot be read or written
         */
        public static File convert(MultipartFile file) throws IOException {
            File convFile = new File(requireSafeFilename(file));
            // try-with-resources closes the stream even when write() throws.
            try (FileOutputStream fos = new FileOutputStream(convFile)) {
                fos.write(file.getBytes());
            }
            return convFile;
        }

        /**
         * Returns the upload's original filename after checking that it is a plain
         * file name. The name is supplied by the client, so it must not be allowed
         * to steer the write outside the target directory.
         *
         * @param file the uploaded file
         * @return the original filename, unchanged
         * @throws IllegalArgumentException if the name is null, empty, {@code "."},
         *                                  {@code ".."}, or contains {@code /} or {@code \}
         */
        private static String requireSafeFilename(MultipartFile file) {
            String name = file.getOriginalFilename();
            if (name == null || name.isEmpty()) {
                throw new IllegalArgumentException("Uploaded file has no filename");
            }
            boolean hasSeparator = name.indexOf('/') >= 0 || name.indexOf('\\') >= 0;
            if (hasSeparator || name.equals(".") || name.equals("..")) {
                throw new IllegalArgumentException("Illegal upload filename: " + name);
            }
            return name;
        }
    }
}
static
<!-- upload page -->
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>test</title>
</head>
<body>
    <h1>Upload a file for OCR</h1>
    <!-- Posts the chosen file as multipart form field "file" to /upload. -->
    <form method="post" action="/upload" enctype="multipart/form-data">
        <input type="file" name="file"/>
        <br/>
        <br/>
        <input type="submit" value="Submit"/>
    </form>
</body>
</html>

<!-- result page -->
<!DOCTYPE html>
<html lang="en" xmlns:th="http://www.thymeleaf.org">
<head>
    <meta charset="UTF-8">
    <title>result</title>
</head>
<body>
    <h1>Extracted Content:</h1>
    <!-- "text" and "file" are the flash attributes set by the upload handler. -->
    <h2><span th:text="${text}"></span></h2>
    <p>From the image:</p>
    <img th:src="'/' + ${file.getOriginalFilename()}" alt="Uploaded image"/>
</body>
</html>
这里的testdata可以到GitHub下载,大概有1.6 GB(建议用油猴(Tampermonkey)找个下载加速脚本,不然会等到崩溃)。然后用两张图片作为测试,下面是识别结果的对比:
可以看出适应性比较差:对于排版标准的图片,识别很精准;但是对于畸形图片,识别准确率较差。
Chineseocr-lite:还有很多项目正在调研和测试中……