跳到内容。

API 示例

本文档提供了在 C++ 中使用 tesseract-ocr API(v3.02.02 – 4.0.0)的简单示例。这里假定 tesseract-ocr 及其所有依赖项均已正确安装,并假定读者熟悉 C++ 以及如何在自己的平台上编译和链接程序;不过文中也为 Linux 初学者提供了基本的编译示例。

有关 tesseract-ocr API 的更多详细信息,请访问 baseapi.h

基本示例

代码

#include <tesseract/baseapi.h>
#include <leptonica/allheaders.h>

// Basic example: recognizes the English text in a single image and prints it.
// Returns 0 on success, 1 when the OCR text could not be produced; exits with
// status 1 when tesseract cannot be initialized or the image cannot be read.
int main()
{
    tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
    // Initialize tesseract-ocr with English, without specifying tessdata path.
    // A non-zero result from Init() is treated as failure.
    if (api->Init(NULL, "eng")) {
        fprintf(stderr, "Could not initialize tesseract.\n");
        delete api;
        exit(1);
    }

    // Open input image with leptonica library; pixRead() yields NULL when the
    // file cannot be read, so check before handing the image to tesseract.
    Pix *image = pixRead("/usr/src/tesseract/testing/phototest.tif");
    if (image == NULL) {
        fprintf(stderr, "Could not read input image.\n");
        api->End();
        delete api;
        exit(1);
    }
    api->SetImage(image);

    // Get OCR result; the caller owns the returned buffer (released with delete[]).
    int status = 0;
    char *outText = api->GetUTF8Text();
    if (outText != NULL) {
        printf("OCR output:\n%s", outText);
    } else {
        fprintf(stderr, "Could not get OCR text.\n");
        status = 1;
    }

    // Destroy used object and release memory
    api->End();
    delete api;
    delete [] outText;
    pixDestroy(&image);

    return status;
}

该程序必须链接到 tesseract-ocr 和 leptonica 库。

如果要将识别限制在图像的某个子矩形区域,请在 SetImage 之后调用 SetRectangle(left, top, width, height)。每次调用 SetRectangle 都会清除之前的识别结果,因此可以用同一幅图像依次识别多个矩形区域。例如:

  // Restrict recognition to the sub-rectangle left=30, top=86, width=590, height=100.
  api->SetRectangle(30, 86, 590, 100);

GetComponentImages 示例

  Pix *image = pixRead("/usr/src/tesseract/testing/phototest.tif");
  tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
  api->Init(NULL, "eng");
  api->SetImage(image);
  Boxa* boxes = api->GetComponentImages(tesseract::RIL_TEXTLINE, true, NULL, NULL);
  printf("Found %d textline image components.\n", boxes->n);
  for (int i = 0; i < boxes->n; i++) {
    BOX* box = boxaGetBox(boxes, i, L_CLONE);
    api->SetRectangle(box->x, box->y, box->w, box->h);
    char* ocrResult = api->GetUTF8Text();
    int conf = api->MeanTextConf();
    fprintf(stdout, "Box[%d]: x=%d, y=%d, w=%d, h=%d, confidence: %d, text: %s",
                    i, box->x, box->y, box->w, box->h, conf, ocrResult);
    boxDestroy(&box);
  }
  delete api;

结果迭代器示例

可以从 ResultIterator 获取每个单词的置信度值和边界框

  Pix *image = pixRead("/usr/src/tesseract/testing/phototest.tif");
  tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
  api->Init(NULL, "eng");
  api->SetImage(image);
  api->Recognize(0);
  tesseract::ResultIterator* ri = api->GetIterator();
  tesseract::PageIteratorLevel level = tesseract::RIL_WORD;
  if (ri != 0) {
    do {
      const char* word = ri->GetUTF8Text(level);
      float conf = ri->Confidence(level);
      int x1, y1, x2, y2;
      ri->BoundingBox(level, &x1, &y1, &x2, &y2);
      printf("word: '%s';  \tconf: %.2f; BoundingBox: %d,%d,%d,%d;\n",
               word, conf, x1, y1, x2, y2);
      delete[] word;
    } while (ri->Next(level));
  }
  delete api;

还可以使用其他迭代器级别(块、行、词等),请参阅 PageIteratorLevel。

方向和文字(script)检测 (OSD) 示例

  const char* inputfile = "/usr/src/tesseract/testing/eurotext.tif";
  tesseract::Orientation orientation;
  tesseract::WritingDirection direction;
  tesseract::TextlineOrder order;
  float deskew_angle;

  PIX *image = pixRead(inputfile);
  tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
  api->Init("/usr/src/tesseract/", "eng");
  api->SetPageSegMode(tesseract::PSM_AUTO_OSD);
  api->SetImage(image);
  api->Recognize(0);

  tesseract::PageIterator* it =  api->AnalyseLayout();
  it->Orientation(&orientation, &direction, &order, &deskew_angle);
  printf("Orientation: %d;\nWritingDirection: %d\nTextlineOrder: %d\n" \
         "Deskew angle: %.4f\n",
         orientation, direction, order, deskew_angle);
  delete api;

各结果代码的含义见 publictypes.h。

遍历单个符号的分类器候选结果的迭代器示例


  Pix *image = pixRead("/usr/src/tesseract/testing/phototest.tif");
  tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
  api->Init(NULL, "eng");
  api->SetImage(image);
  api->SetVariable("save_blob_choices", "T");
  api->SetRectangle(37, 228, 548, 31);
  api->Recognize(NULL);

  tesseract::ResultIterator* ri = api->GetIterator();
  tesseract::PageIteratorLevel level = tesseract::RIL_SYMBOL;
  if(ri != 0) {
      do {
          const char* symbol = ri->GetUTF8Text(level);
          float conf = ri->Confidence(level);
          if(symbol != 0) {
              printf("symbol %s, conf: %f", symbol, conf);
              bool indent = false;
              tesseract::ChoiceIterator ci(*ri);
              do {
                  if (indent) printf("\t\t ");
                  printf("\t- ");
                  const char* choice = ci.GetUTF8Text();
                  printf("%s conf: %f\n", choice, ci.Confidence());
                  indent = true;
              } while(ci.Next());
          }
          printf("---------------------------------------------\n");
          delete[] symbol;
      } while((ri->Next(level)));
  }
  delete api;

获取 LSTM 每个字符的备选符号选择的置信度示例

#include <tesseract/baseapi.h>
#include <leptonica/allheaders.h>
// LSTM example: for every recognized word, prints the alternative symbol
// choices of each timestep with their confidence (as a truncated percentage)
// and the bounding box of the word.
int main()
{
    tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
    // Initialize tesseract-ocr with English, without specifying tessdata path
    if (api->Init(NULL, "eng")) {
        fprintf(stderr, "Could not initialize tesseract.\n");
        delete api;
        exit(1);
    }
    // Open input image with leptonica library; pixRead() yields NULL on failure.
    Pix *image = pixRead("/home/ubuntu/tesseract/test/testing/trainingital.tif");
    if (image == NULL) {
        fprintf(stderr, "Could not read input image.\n");
        api->End();
        delete api;
        exit(1);
    }
    api->SetImage(image);
    // Set lstm_choice_mode to alternative symbol choices per character, bbox is at word level.
    api->SetVariable("lstm_choice_mode", "2");
    api->Recognize(0);
    tesseract::PageIteratorLevel level = tesseract::RIL_WORD;
    tesseract::ResultIterator* res_it = api->GetIterator();
    // Get confidence level for alternative symbol choices. Code is based on
    // https://github.com/tesseract-ocr/tesseract/blob/a7a729f6c315e751764b72ea945da961638effc5/src/api/hocrrenderer.cpp#L325-L344
    if (res_it != nullptr) {
        do {
            int x1, y1, x2, y2;
            res_it->BoundingBox(level, &x1, &y1, &x2, &y2);
            // Outer vector: one entry per timestep; inner vector: (symbol, probability) pairs.
            const auto* choiceMap = res_it->GetBestLSTMSymbolChoices();
            if (choiceMap != nullptr) {
                int wcnt = 0;  // index of the timestep within the current word
                // Iterate by reference so the inner vectors are not copied.
                for (const auto& timestep : *choiceMap) {
                    for (const auto& j : timestep) {
                        // Truncate the probability to a whole percentage.
                        const float conf = static_cast<float>(static_cast<int>(j.second * 100));
                        const char* word = j.first;
                        printf("%d  symbol: '%s';  \tconf: %.2f; BoundingBox: %d,%d,%d,%d;\n",
                               wcnt, word, conf, x1, y1, x2, y2);
                    }
                    wcnt++;
                    printf("\n");
                }
            }
        } while (res_it->Next(level));
        // The iterator belongs to the caller; delete it before shutting the API down.
        delete res_it;
    }
    // Destroy used object and release memory
    api->End();
    delete api;
    pixDestroy(&image);
    return 0;
}

获取带有每个字符备选符号选择的 HOCR 输出示例 (LSTM)

这类似于使用 -c lstm_choice_mode=2 hocr 从命令行运行 tesseract。

#include <tesseract/baseapi.h>
#include <leptonica/allheaders.h>

// HOCR example: prints the HOCR output of one image with lstm_choice_mode=2,
// i.e. including the alternative symbol choices per character.
// Returns 0 on success, 1 when no HOCR text could be produced; exits with
// status 1 when tesseract cannot be initialized or the image cannot be read.
int main()
{
    tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
    // Initialize tesseract-ocr with English, without specifying tessdata path
    if (api->Init(NULL, "eng")) {
        fprintf(stderr, "Could not initialize tesseract.\n");
        delete api;
        exit(1);
    }

    // Open input image with leptonica library; pixRead() yields NULL on failure.
    Pix *image = pixRead("/tesseract/test/testing/trainingital.tif");
    if (image == NULL) {
        fprintf(stderr, "Could not read input image.\n");
        api->End();
        delete api;
        exit(1);
    }
    api->SetImage(image);
    api->SetVariable("lstm_choice_mode", "2");

    // Get HOCR result for page 0; the caller owns the returned buffer.
    int status = 0;
    char *outText = api->GetHOCRText(0);
    if (outText != NULL) {
        printf("HOCR alternative symbol choices  per character :\n%s", outText);
    } else {
        fprintf(stderr, "Could not get HOCR text.\n");
        status = 1;
    }

    // Destroy used object and release memory
    api->End();
    delete api;
    delete [] outText;
    pixDestroy(&image);

    return status;
}

注意不同的置信度值

<span class='ocrx_cinfo' id='lstm_choices_1_4_4'>
<span class='ocr_glyph' id='choice_1_4_7' title='x_confs 83'>m</span>
<span class='ocr_glyph' id='choice_1_4_8' title='x_confs 16'>n</span>
</span>
<span class='ocrx_cinfo' id='lstm_choices_1_4_5'>
<span class='ocr_glyph' id='choice_1_4_9' title='x_confs 54'>n</span>
<span class='ocr_glyph' id='choice_1_4_10' title='x_confs 17'>i</span>
<span class='ocr_glyph' id='choice_1_4_11' title='x_confs 15'>m</span>
<span class='ocr_glyph' id='choice_1_4_12' title='x_confs 6'>a</span>
<span class='ocr_glyph' id='choice_1_4_13' title='x_confs 4'>u</span>
<span class='ocr_glyph' id='choice_1_4_14' title='x_confs 1'>r</span>
<span class='ocr_glyph' id='choice_1_4_15' title='x_confs 1'>e</span>
</span>

在 Linux 上编译 C++ API 程序

在 Linux 中,以标准方式包含和链接到 Tesseract 的 API。要针对 API 编译基本程序,可以使用以下命令

# Build myprogram.cpp and link it against the leptonica (-llept) and tesseract (-ltesseract) libraries.
g++ -o myprogram myprogram.cpp -llept -ltesseract

如果 Tesseract 安装在不寻常的位置,可以使用 g++ 的 -I 和 -L 标志显式指定包含和 lib 目录,如下所示

# The examples include <tesseract/baseapi.h> and <leptonica/allheaders.h>, so -I has to
# name the directory that CONTAINS the tesseract/ and leptonica/ header folders.
# The second -I keeps sources that include <baseapi.h> / <capi.h> directly working.
g++ -o myprogram myprogram.cpp -I/home/nick/local/include -I/home/nick/local/include/tesseract -L/home/nick/local/lib -llept -ltesseract

在 Python 中使用 C-API

从 3.02.02 版本开始,tesseract-ocr 提供 C-API。这使得在 Python(以及其他可以使用 C 库的语言)中使用 tesseract-ocr 共享库成为可能。

# Python 3 example: drive the tesseract-ocr C-API through ctypes, run OCR on one
# image file and print the library version and the recognized text.
import os
import ctypes

lang = "eng"
filename = "/usr/src/tesseract-ocr/phototest.tif"
libname = "/usr/local/lib64/libtesseract.so.3"
TESSDATA_PREFIX = os.environ.get('TESSDATA_PREFIX')
if not TESSDATA_PREFIX:
    TESSDATA_PREFIX = "../"

tesseract = ctypes.cdll.LoadLibrary(libname)

# ctypes assumes C `int` for every argument and return value unless told
# otherwise, which truncates 64-bit pointers. Declare the pointer-carrying
# signatures explicitly.
tesseract.TessVersion.restype = ctypes.c_char_p
tesseract.TessBaseAPICreate.restype = ctypes.c_void_p
tesseract.TessBaseAPIInit3.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_char_p]
tesseract.TessBaseAPIInit3.restype = ctypes.c_int
tesseract.TessBaseAPIEnd.argtypes = [ctypes.c_void_p]
tesseract.TessBaseAPIEnd.restype = None
tesseract.TessBaseAPIDelete.argtypes = [ctypes.c_void_p]
tesseract.TessBaseAPIDelete.restype = None
tesseract.TessDeleteText.argtypes = [ctypes.c_void_p]
tesseract.TessDeleteText.restype = None
# NOTE(review): this 4-argument form returning the text matches the 3.02 C-API
# that libtesseract.so.3 is expected to provide; later releases changed the
# signature of TessBaseAPIProcessPages -- confirm against your capi.h.
tesseract.TessBaseAPIProcessPages.argtypes = [
    ctypes.c_void_p, ctypes.c_char_p, ctypes.c_char_p, ctypes.c_int]
tesseract.TessBaseAPIProcessPages.restype = ctypes.c_void_p

tesseract_version = tesseract.TessVersion()
api = tesseract.TessBaseAPICreate()
# C strings are bytes in Python 3, so encode every str argument.
rc = tesseract.TessBaseAPIInit3(api, TESSDATA_PREFIX.encode(), lang.encode())
if rc:
    tesseract.TessBaseAPIDelete(api)
    print("Could not initialize tesseract.\n")
    exit(3)

text_out = tesseract.TessBaseAPIProcessPages(api, filename.encode(), None, 0)
if not text_out:
    tesseract.TessBaseAPIEnd(api)
    tesseract.TessBaseAPIDelete(api)
    print("Could not process the input file.\n")
    exit(4)
# Copy the text into a Python bytes object before the C buffer is released.
result_text = ctypes.string_at(text_out)
tesseract.TessDeleteText(text_out)
tesseract.TessBaseAPIEnd(api)
tesseract.TessBaseAPIDelete(api)

print('Tesseract-ocr version', tesseract_version.decode('utf-8'))
print(result_text.decode('utf-8'))

pastebin 上可以找到将 python 文件对象传递给 C-API 的示例。

从 Tesseract 4.0 中提取方向的示例

#!/usr/bin/env python3
# coding: utf-8
"""Detect the orientation and script of an image through the Tesseract C-API.

Loads libtesseract and Leptonica with cffi, prints both library versions,
reads 'mono.png' and prints the detected script name, the orientation in
degrees and the two confidence values.
"""

PATH_TO_LIBTESS = '/path/to/development/libtesseract.so'

from ctypes.util import find_library

import cffi  # requires "pip install cffi"

ffi = cffi.FFI()
ffi.cdef("""
struct Pix;
typedef struct Pix PIX;
PIX * pixRead ( const char *filename );
void pixDestroy ( PIX **ppix );
char * getLeptonicaVersion (  );

typedef struct TessBaseAPI TessBaseAPI;
typedef int BOOL;

const char* TessVersion();

TessBaseAPI* TessBaseAPICreate();
void TessBaseAPIDelete(TessBaseAPI* handle);
int TessBaseAPIInit3(TessBaseAPI* handle, const char* datapath, const char* language);
void TessBaseAPIEnd(TessBaseAPI* handle);

void TessBaseAPISetImage2(TessBaseAPI* handle, struct Pix* pix);

BOOL   TessBaseAPIDetectOrientationScript(TessBaseAPI* handle, char** best_script_name,
                                                            int* best_orientation_deg, float* script_confidence,
                                                            float* orientation_confidence);
""")

libtess = ffi.dlopen(PATH_TO_LIBTESS)
# find_library() returns None when the library is missing, and ffi.dlopen(None)
# would then silently open the standard C library instead of Leptonica.
lept_path = find_library('lept')
if lept_path is None:
    raise OSError("Leptonica shared library ('lept') not found")
liblept = ffi.dlopen(lept_path)

# Show which library versions were actually loaded.
print(ffi.string(libtess.TessVersion()).decode('utf-8'))
print(ffi.string(liblept.getLeptonicaVersion()).decode('utf-8'))

api = libtess.TessBaseAPICreate()

# A non-zero result from TessBaseAPIInit3 is treated as failure.
if libtess.TessBaseAPIInit3(api, ffi.NULL, ffi.NULL) != 0:
    libtess.TessBaseAPIDelete(api)
    raise RuntimeError("Could not initialize tesseract.")

pix = liblept.pixRead('mono.png'.encode())
if pix == ffi.NULL:
    libtess.TessBaseAPIEnd(api)
    libtess.TessBaseAPIDelete(api)
    raise RuntimeError("Could not read input image.")

libtess.TessBaseAPISetImage2(api, pix)

script_name = ffi.new('char **')
orient_deg = ffi.new('int *')
script_conf = ffi.new('float *')
orient_conf = ffi.new('float *')
detected = libtess.TessBaseAPIDetectOrientationScript(
    api, script_name, orient_deg, script_conf, orient_conf)

if detected:
    # script_name is a `char **`; the C string itself is script_name[0].
    print(ffi.string(script_name[0]).decode('utf-8'))
    print(orient_deg[0])
    print(script_conf[0])
    print(orient_conf[0])
else:
    print("Orientation/script detection failed.")

# Release the native resources; pixDestroy() expects the address of the pointer.
pix_ref = ffi.new('PIX **', pix)
liblept.pixDestroy(pix_ref)
libtess.TessBaseAPIEnd(api)
libtess.TessBaseAPIDelete(api)

在 C 程序中使用 C-API 的示例

当然,C-API 也可用于常规的 C 程序,如以下非常基本的示例所示。

#include <stdio.h>
#include <allheaders.h>
#include <capi.h>

/* Write the message errstr to stderr unchanged, then terminate the process
 * with exit status 1. Never returns. */
void die(const char *errstr) {
	fprintf(stderr, "%s", errstr);
	exit(1);
}

/* Basic C-API program: reads "img.png", recognizes it as English text and
 * writes the result to stdout. Every failure ends the process through die().
 * The command-line arguments are not used. */
int main(int argc, char *argv[]) {
	TessBaseAPI *api;
	PIX *image;
	char *ocr_text;

	/* Load the input image with leptonica. */
	image = pixRead("img.png");
	if(!image)
		die("Error reading image\n");

	/* Create the engine and initialize it for English (default data path). */
	api = TessBaseAPICreate();
	if(TessBaseAPIInit3(api, NULL, "eng"))
		die("Error initializing tesseract\n");

	/* Recognize the image without a progress monitor. */
	TessBaseAPISetImage2(api, image);
	if(TessBaseAPIRecognize(api, NULL))
		die("Error in Tesseract recognition\n");

	ocr_text = TessBaseAPIGetUTF8Text(api);
	if(!ocr_text)
		die("Error getting text\n");

	fputs(ocr_text, stdout);

	/* Release the text, the engine and the image. */
	TessDeleteText(ocr_text);
	TessBaseAPIEnd(api);
	TessBaseAPIDelete(api);
	pixDestroy(&image);

	return 0;
}

在 Linux 上,您可以像使用 C++ API 构建程序那样编译它

从图像在 C++ 中创建可搜索的 pdf 的示例

#include <leptonica/allheaders.h>
#include <tesseract/baseapi.h>
#include <tesseract/renderer.h>

// Searchable-PDF example: runs OCR on one image and writes the result through
// a TessPDFRenderer constructed with the output base name
// "my_first_tesseract_pdf".
// Returns EXIT_SUCCESS when ProcessPages() succeeds and EXIT_FAILURE otherwise;
// exits with status 1 when tesseract cannot be initialized.
int main()
{
    const char* input_image = "/usr/src/tesseract-oc/testing/phototest.tif";
    const char* output_base = "my_first_tesseract_pdf";
    const char* datapath = "/Projects/OCR/tesseract/tessdata";
    int timeout_ms = 5000;
    const char* retry_config = nullptr;
    bool textonly = false;
    int jpg_quality = 92;

    tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
    if (api->Init(datapath, "eng")) {
        fprintf(stderr, "Could not initialize tesseract.\n");
        delete api;
        exit(1);
    }

    tesseract::TessPDFRenderer *renderer = new tesseract::TessPDFRenderer(
              output_base, api->GetDatapath(), textonly, jpg_quality);

    const bool succeed = api->ProcessPages(input_image, retry_config, timeout_ms, renderer);
    if (!succeed) {
      fprintf(stderr, "Error during processing.\n");
    }

    // Release the renderer and the API on every path, not only on success.
    // The renderer is deleted first, while the API it was used with is still alive.
    delete renderer;
    api->End();
    delete api;
    return succeed ? EXIT_SUCCESS : EXIT_FAILURE;
}

在 C++ 中监控 OCR 进度示例

#include <tesseract/baseapi.h>
#include <tesseract/ocrclass.h>
#include <leptonica/allheaders.h>
#include <chrono>
#include <thread>

// Forward declarations of the two thread entry points defined below.
// monitorProgress: prints the progress stored in monitor[page] until it reaches 100.
void monitorProgress(ETEXT_DESC *monitor, int page);
// ocrProcess: runs api->Recognize() with the given progress monitor.
void ocrProcess(tesseract::TessBaseAPI *api, ETEXT_DESC *monitor);

// Thread entry point: repeatedly prints the progress percentage stored in
// monitor[page] on one console line and returns once it has printed 100.
//   monitor - progress descriptor(s) being updated by the OCR thread
//   page    - index of the descriptor to watch
// NOTE(review): the progress field is read without any synchronization while
// another thread writes it; the loop only ends when the value reaches exactly
// 100 -- confirm that Recognize() always gets there, also on failure.
void monitorProgress(ETEXT_DESC *monitor, int page) {
    while (1) {
        // Read the value once per iteration so that the number printed and the
        // number tested are the same; otherwise the loop can stop without ever
        // having printed "100%".
        const int progress = monitor[page].progress;
        printf( "\r%3d%%", progress);
        fflush (stdout);
        if (progress==100)
            break;
        // Poll at a modest rate instead of spinning a CPU core at full speed.
        std::this_thread::sleep_for(std::chrono::milliseconds(50));
    }
}

// Thread entry point: runs recognition on `api`, passing `monitor` so that the
// progress can be observed from another thread. The status returned by
// Recognize() is deliberately discarded.
void ocrProcess(tesseract::TessBaseAPI *api, ETEXT_DESC *monitor) {
    static_cast<void>(api->Recognize(monitor));
}

// Progress-monitoring example: recognizes one image in a worker thread while a
// second thread prints the progress, then prints the recognized text.
// Returns 0 on success, 1 when tesseract cannot be initialized and 2 when the
// input image cannot be read.
int main() {
    tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
    if (api->Init("/tesseract/tessdata_best", "eng")) {
        fprintf(stderr, "Could not initialize tesseract.\n");
        delete api;
        return 1;
    }
    api->SetPageSegMode(tesseract::PSM_AUTO);
    Pix *image = pixRead("/tesseract-ocr/test/testing/phototest.tif");
    if (!image) {
        fprintf(stderr, "Leptonica can't process input file!\n");
        api->End();
        delete api;
        return 2;
    }
    api->SetImage(image);

    // The monitor is created only once it is needed, so the early returns
    // above have nothing extra to release.
    ETEXT_DESC *monitor = new ETEXT_DESC();
    int page = 0;
    std::thread t1(ocrProcess, api, monitor);
    std::thread t2(monitorProgress, monitor, page);
    t1.join();
    t2.join();
    // Both threads are finished, so nobody uses the monitor any more.
    delete monitor;
    pixDestroy(&image);

    // The text buffer is owned by the caller; never hand a null pointer to %s.
    char *outText = api->GetUTF8Text();
    printf("\n%s", outText ? outText : "");
    delete [] outText;
    api->End();
    delete api;
    return 0;
}

更复杂的示例(例如取消 OCR 进程)可以在 TesseractGui、gImageReader 或 Android 应用 Text Fairy 的源代码中找到。