Package org.apache.tika.parser.microsoft
Class WordExtractor
java.lang.Object
org.apache.tika.parser.microsoft.WordExtractor
-
Nested Class Summary
Nested Classes -
Field Summary
Fields
Modifier and Type | Field | Description
protected final org.apache.tika.parser.ParseContext | context |
protected final OfficeParserConfig | officeParserConfig |
protected final org.apache.tika.metadata.Metadata | parentMetadata |
-
Constructor Summary
Constructors
Constructor | Description
WordExtractor(org.apache.tika.parser.ParseContext context, org.apache.tika.metadata.Metadata metadata) |
-
Method Summary
Modifier and Type | Method | Description
static WordExtractor.TagAndStyle | buildParagraphTagAndStyle(String styleName, boolean isTable) | Given a style name, return what tag should be used, and what style should be applied to it.
protected org.apache.tika.detect.Detector | getDetector() |
protected org.apache.tika.mime.MimeTypes | getMimeTypes() | Deprecated.
protected String | getPassword() | Returns the password to be used for this file, or null if no / default password should be used
protected org.apache.tika.config.TikaConfig | getTikaConfig() |
protected void | handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, String resourceName, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) | Handle an office document that's embedded at the POIFS level
protected void | handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) | Handle an office document that's embedded at the POIFS level
protected void | handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, String filename, String relationshipID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) |
protected void | handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) |
protected void | handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, org.apache.tika.metadata.Metadata embeddedMetadata, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) |
protected void | parse(org.apache.poi.poifs.filesystem.DirectoryNode root, org.apache.tika.sax.XHTMLContentHandler xhtml) |
protected void | parse(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, org.apache.tika.sax.XHTMLContentHandler xhtml) |
protected void | parseWord6(org.apache.poi.poifs.filesystem.DirectoryNode root, org.apache.tika.sax.XHTMLContentHandler xhtml) |
protected void | parseWord6(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, org.apache.tika.sax.XHTMLContentHandler xhtml) |
-
Field Details
-
parentMetadata
protected final org.apache.tika.metadata.Metadata parentMetadata -
officeParserConfig
protected final OfficeParserConfig officeParserConfig
-
context
protected final org.apache.tika.parser.ParseContext context
-
-
Constructor Details
-
WordExtractor
public WordExtractor(org.apache.tika.parser.ParseContext context, org.apache.tika.metadata.Metadata metadata)
-
-
Method Details
-
buildParagraphTagAndStyle
public static WordExtractor.TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable)
Given a style name, return what tag should be used, and what style should be applied to it.
-
parse
protected void parse(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, org.apache.tika.sax.XHTMLContentHandler xhtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
parse
protected void parse(org.apache.poi.poifs.filesystem.DirectoryNode root, org.apache.tika.sax.XHTMLContentHandler xhtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
parseWord6
protected void parseWord6(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, org.apache.tika.sax.XHTMLContentHandler xhtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
parseWord6
protected void parseWord6(org.apache.poi.poifs.filesystem.DirectoryNode root, org.apache.tika.sax.XHTMLContentHandler xhtml) throws IOException, SAXException
Throws:
IOException
SAXException
-
getTikaConfig
protected org.apache.tika.config.TikaConfig getTikaConfig() -
getDetector
protected org.apache.tika.detect.Detector getDetector() -
getMimeTypes
protected org.apache.tika.mime.MimeTypes getMimeTypes()
Deprecated.
Returns:
mimetypes
-
getPassword
protected String getPassword()
Returns the password to be used for this file, or null if no / default password should be used.
-
handleEmbeddedResource
protected void handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, String filename, String relationshipID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
handleEmbeddedResource
protected void handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
handleEmbeddedResource
protected void handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, org.apache.tika.metadata.Metadata embeddedMetadata, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
handleEmbeddedOfficeDoc
protected void handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Handle an office document that's embedded at the POIFS level.
Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
handleEmbeddedOfficeDoc
protected void handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, String resourceName, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Handle an office document that's embedded at the POIFS level.
Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
embeddedDocumentUtil