public class TesseractOCRParser
extends org.apache.tika.parser.AbstractExternalProcessParser
implements org.apache.tika.config.Initializable
To configure this parser, create a TesseractOCRConfig object and pass it through a
ParseContext. Tesseract-ocr must be installed and on the system path, or the path
to its root folder must be provided:
TesseractOCRConfig config = new TesseractOCRConfig();
//Needed if tesseract is not on system path
config.setTesseractPath(tesseractFolder);
parseContext.set(TesseractOCRConfig.class, config);
| Modifier and Type | Field and Description |
|---|---|
static org.apache.tika.metadata.Property |
IMAGE_MAGICK |
static org.apache.tika.metadata.Property |
IMAGE_ROTATION |
static org.apache.tika.metadata.Property |
PSM0_ORIENTATION |
static org.apache.tika.metadata.Property |
PSM0_ORIENTATION_CONFIDENCE |
static org.apache.tika.metadata.Property |
PSM0_PAGE_NUMBER |
static org.apache.tika.metadata.Property |
PSM0_ROTATE |
static org.apache.tika.metadata.Property |
PSM0_SCRIPT |
static org.apache.tika.metadata.Property |
PSM0_SCRIPT_CONFIDENCE |
static String |
TESS_META |
| Constructor and Description |
|---|
TesseractOCRParser() |
| Modifier and Type | Method and Description |
|---|---|
void |
checkInitialization(org.apache.tika.config.InitializableProblemHandler problemHandler) |
String |
getColorspace() |
TesseractOCRConfig |
getDefaultConfig() |
int |
getDensity() |
int |
getDepth() |
String |
getFilter() |
String |
getImageMagickPath() |
static String |
getImageMagickProg() |
Set<String> |
getLangs() |
String |
getLanguage() |
long |
getMaxFileSizeToOcr() |
long |
getMinFileSizeToOcr() |
List<String> |
getOtherTesseractSettings() |
String |
getOutputType() |
String |
getPageSegMode() |
int |
getResize() |
Set<org.apache.tika.mime.MediaType> |
getSupportedTypes(org.apache.tika.parser.ParseContext context) |
String |
getTessdataPath() |
String |
getTesseractPath() |
static String |
getTesseractProg() |
int |
getTimeout() |
boolean |
hasTesseract() |
protected boolean |
hasWarned() |
void |
initialize(Map<String,org.apache.tika.config.Param> params) |
boolean |
isApplyRotation() |
boolean |
isEnableImagePreprocessing() |
boolean |
isPreloadLangs() |
boolean |
isPreserveInterwordSpacing() |
boolean |
isSkipOCR() |
void |
parse(Image image,
ContentHandler handler,
org.apache.tika.metadata.Metadata metadata,
org.apache.tika.parser.ParseContext context) |
void |
parse(InputStream stream,
ContentHandler handler,
org.apache.tika.metadata.Metadata metadata,
org.apache.tika.parser.ParseContext parseContext) |
void |
setApplyRotation(boolean applyRotation) |
void |
setColorspace(String colorspace) |
void |
setDensity(int density) |
void |
setDepth(int depth) |
void |
setEnableImagePreprocessing(boolean enableImagePreprocessing) |
void |
setFilter(String filter) |
void |
setImageMagickPath(String imageMagickPath)
Set the path to the ImageMagick executable directory, needed if it is not on system path.
|
void |
setLanguage(String language) |
void |
setMaxFileSizeToOcr(long maxFileSizeToOcr) |
void |
setMinFileSizeToOcr(long minFileSizeToOcr) |
void |
setOtherTesseractSettings(List<String> settings) |
void |
setOutputType(String outputType) |
void |
setPageSegMode(String pageSegMode) |
void |
setPreloadLangs(boolean preloadLangs)
If set to
true and if tesseract is found, this will load the
langs that result from --list-langs. |
void |
setPreserveInterwordSpacing(boolean preserveInterwordSpacing) |
void |
setResize(int resize) |
void |
setSkipOCR(boolean skipOCR) |
void |
setTessdataPath(String tessdataPath)
Set the path to the 'tessdata' folder, which contains language files and config files.
|
void |
setTesseractPath(String tesseractPath)
Set the path to the Tesseract executable's directory, needed if it is not on system path.
|
void |
setTimeout(int timeout)
Set default timeout in seconds.
|
protected void |
warn() |
public static final String TESS_META
public static final org.apache.tika.metadata.Property IMAGE_ROTATION
public static final org.apache.tika.metadata.Property IMAGE_MAGICK
public static final org.apache.tika.metadata.Property PSM0_PAGE_NUMBER
public static final org.apache.tika.metadata.Property PSM0_ORIENTATION
public static final org.apache.tika.metadata.Property PSM0_ROTATE
public static final org.apache.tika.metadata.Property PSM0_ORIENTATION_CONFIDENCE
public static final org.apache.tika.metadata.Property PSM0_SCRIPT
public static final org.apache.tika.metadata.Property PSM0_SCRIPT_CONFIDENCE
public static String getImageMagickProg()
public static String getTesseractProg()
public Set<org.apache.tika.mime.MediaType> getSupportedTypes(org.apache.tika.parser.ParseContext context)
Specified by: getSupportedTypes in interface org.apache.tika.parser.Parser
public boolean hasTesseract()
throws org.apache.tika.exception.TikaConfigException
Throws: org.apache.tika.exception.TikaConfigException
public void parse(Image image, ContentHandler handler, org.apache.tika.metadata.Metadata metadata, org.apache.tika.parser.ParseContext context) throws IOException, SAXException, org.apache.tika.exception.TikaException
Throws: IOException, SAXException, org.apache.tika.exception.TikaException
public void parse(InputStream stream, ContentHandler handler, org.apache.tika.metadata.Metadata metadata, org.apache.tika.parser.ParseContext parseContext) throws IOException, SAXException, org.apache.tika.exception.TikaException
Specified by: parse in interface org.apache.tika.parser.Parser
Throws: IOException, SAXException, org.apache.tika.exception.TikaException
public void initialize(Map<String,org.apache.tika.config.Param> params) throws org.apache.tika.exception.TikaConfigException
Specified by: initialize in interface org.apache.tika.config.Initializable
Throws: org.apache.tika.exception.TikaConfigException
public void checkInitialization(org.apache.tika.config.InitializableProblemHandler problemHandler)
throws org.apache.tika.exception.TikaConfigException
Specified by: checkInitialization in interface org.apache.tika.config.Initializable
Throws: org.apache.tika.exception.TikaConfigException
protected boolean hasWarned()
protected void warn()
public String getTesseractPath()
@Field public void setTesseractPath(String tesseractPath)
Set the path to the Tesseract executable's directory, needed if it is not on system path.
Note that if you set this value, it is highly recommended that you also
set the path to (and including) the 'tessdata' folder using setTessdataPath(java.lang.String).
public String getTessdataPath()
@Field public void setTessdataPath(String tessdataPath)
public String getImageMagickPath()
@Field public void setImageMagickPath(String imageMagickPath)
Parameters: imageMagickPath - path to the ImageMagick executable directory.
@Field public void setOtherTesseractSettings(List<String> settings) throws org.apache.tika.exception.TikaConfigException
Throws: org.apache.tika.exception.TikaConfigException
@Field public void setSkipOCR(boolean skipOCR)
public boolean isSkipOCR()
@Field public void setLanguage(String language)
public String getLanguage()
@Field public void setPageSegMode(String pageSegMode)
public String getPageSegMode()
@Field public void setMaxFileSizeToOcr(long maxFileSizeToOcr)
public long getMaxFileSizeToOcr()
@Field public void setMinFileSizeToOcr(long minFileSizeToOcr)
public long getMinFileSizeToOcr()
@Field public void setTimeout(int timeout)
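Set default timeout in seconds. This can be overridden per parse with a
TikaTaskTimeout sent in via the ParseContext at parse time.
Parameters: timeout - the default timeout, in seconds
public int getTimeout()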
@Field public void setOutputType(String outputType)
public String getOutputType()
@Field public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing)
public boolean isPreserveInterwordSpacing()
@Field public void setEnableImagePreprocessing(boolean enableImagePreprocessing)
public boolean isEnableImagePreprocessing()
@Field public void setDensity(int density)
public int getDensity()
@Field public void setDepth(int depth)
public int getDepth()
@Field public void setColorspace(String colorspace)
public String getColorspace()
@Field public void setFilter(String filter)
public String getFilter()
@Field public void setResize(int resize)
public int getResize()
@Field public void setApplyRotation(boolean applyRotation)
public boolean isApplyRotation()
@Field public void setPreloadLangs(boolean preloadLangs)
If set to true and if tesseract is found, this will load the
langs that result from --list-langs. At parse time, the
parser will verify that tesseract has the requested lang
available.
If set to false (the default) and tesseract is found, if a user
requests a language that tesseract does not have data for,
a TikaException will be thrown with tesseract's native exception
message, which is a bit less readable.
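Parameters: preloadLangs - whether to load tesseract's available languages (from --list-langs) ahead of parse time
public boolean isPreloadLangs()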
public TesseractOCRConfig getDefaultConfig()
Copyright © 2007–2025 The Apache Software Foundation. All rights reserved.