public abstract class PDFBoxTree
extends org.apache.pdfbox.text.PDFTextStripper
| Modifier and Type | Field and Description |
|---|---|
protected static String[] |
cssFontFamily
Known font names that are recognized in the PDF files
|
protected static String[] |
cssFontStyle
Font styles corresponding to the font subtypes in
pdFontType |
protected static String[] |
cssFontWeight
Font weights corresponding to the font subtypes in
pdFontType |
protected float |
cur_x
Current text coordinates (the coordinates of the last encountered text box).
|
protected float |
cur_y
Current text coordinates (the coordinates of the last encountered text box).
|
protected BoxStyle |
curstyle
The style of the text line being created
|
protected boolean |
disableGraphics
When set to
true, the graphics in the PDF file will be ignored. |
protected boolean |
disableImageData
When set to
true, the image data will not be transferred to the HTML data: url. |
protected boolean |
disableImages
When set to
true, the embedded images will be ignored. |
protected int |
endPage
Last page to be processed
|
protected FontTable |
fontTable
Table of embedded fonts
|
protected Vector<PathSegment> |
graphicsPath
Current graphics path
|
protected org.apache.pdfbox.text.TextPosition |
lastDia
Last diacritic if any
|
protected org.apache.pdfbox.text.TextPosition |
lastText
Previous positioned text.
|
protected float |
path_start_x
Starting path construction position
|
protected float |
path_start_y
Starting path construction position
|
protected float |
path_x
Current path construction position
|
protected float |
path_y
Current path construction position
|
protected static String[] |
pdFontType
Known font subtypes recognized in PDF files
|
protected org.apache.pdfbox.pdmodel.PDPage |
pdpage
The PDF page currently being processed
|
protected int |
startPage
First page to be processed
|
protected BoxStyle |
style
The style of the future box being modified by the operators
|
protected StringBuilder |
textLine
The text box currently being created.
|
protected TextMetrics |
textMetrics
Current text line metrics
|
static String |
UNIT
Length units used in the generated CSS
|
| Constructor and Description |
|---|
PDFBoxTree() |
| Modifier and Type | Method and Description |
|---|---|
protected String |
colorString(float r,
float g,
float b)
Creates a CSS rgb() specification from the color component values.
|
protected String |
colorString(int ir,
int ig,
int ib)
Creates a CSS rgb() specification from the color component values.
|
protected String |
colorString(org.apache.pdfbox.pdmodel.graphics.color.PDColor pdcolor)
Creates a CSS rgb specification from a PDF color
|
protected AffineTransform |
createCurrentPageTransformation() |
protected void |
finishBox()
Finishes the current box - empties the text line buffer and creates a DOM element from it.
|
protected float |
floatValue(org.apache.pdfbox.cos.COSBase value)
Obtains a number from a PDF number value
|
protected org.apache.pdfbox.pdmodel.common.PDRectangle |
getCurrentMediaBox()
Obtains the media box valid for the current page.
|
boolean |
getDisableGraphics()
Checks whether the graphics processing is disabled.
|
boolean |
getDisableImageData()
Checks whether the copying of image data is disabled.
|
boolean |
getDisableImages()
Checks whether processing of embedded images is disabled.
|
int |
getEndPage() |
protected float |
getLength(org.apache.pdfbox.cos.COSBase value)
Obtains a length in points from a PDF number value
|
int |
getStartPage() |
protected byte |
getTextDirectionality(String s) |
protected byte |
getTextDirectionality(org.apache.pdfbox.text.TextPosition text) |
protected String |
getTitle() |
protected int |
intValue(org.apache.pdfbox.cos.COSBase value)
Obtains a number from a PDF number value
|
protected boolean |
isReversed(byte directionality)
Checks whether the text directionality corresponds to reversed text (very rough)
|
protected void |
processImageOperation(List<org.apache.pdfbox.cos.COSBase> arguments) |
protected void |
processOperator(org.apache.pdfbox.contentstream.operator.Operator operator,
List<org.apache.pdfbox.cos.COSBase> arguments) |
void |
processPage(org.apache.pdfbox.pdmodel.PDPage page) |
protected void |
processTextPosition(org.apache.pdfbox.text.TextPosition text) |
protected abstract void |
renderImage(float x,
float y,
float width,
float height,
ImageResource data)
Adds an image to the current page.
|
protected abstract void |
renderPath(List<PathSegment> path,
boolean stroke,
boolean fill)
Adds a rectangle to the current page at the specified position.
|
protected abstract void |
renderText(String data,
TextMetrics metrics)
Creates a new text box in the current page.
|
void |
setDisableGraphics(boolean disableGraphics)
Disables the processing of the graphic operators in the PDF files.
|
void |
setDisableImageData(boolean disableImageData)
Disables the copying of the image data to the resulting DOM tree.
|
void |
setDisableImages(boolean disableImages)
Disables the processing of images contained in the PDF files.
|
void |
setEndPage(int endPage) |
void |
setStartPage(int startPage) |
protected void |
showGlyph(org.apache.pdfbox.util.Matrix arg0,
org.apache.pdfbox.pdmodel.font.PDFont arg1,
int arg2,
String arg3,
org.apache.pdfbox.util.Vector arg4) |
protected abstract void |
startNewPage()
Adds a new page to the resulting document and makes it the current (active) page.
|
protected String |
stringValue(org.apache.pdfbox.cos.COSBase value)
Obtains a string from a PDF value
|
protected float[] |
toRectangle(List<PathSegment> path) |
protected float |
transformLength(float w)
Transforms a length according to the current transformation matrix.
|
protected float[] |
transformPosition(float x,
float y)
Transforms a position according to the current transformation matrix and current page transformation.
|
protected void |
updateFontTable()
Updates the font table by adding new fonts used at the current page.
|
protected void |
updateStyle(BoxStyle bstyle,
org.apache.pdfbox.text.TextPosition text)
Updates the text style according to a new text position
|
endArticle, endDocument, endPage, getAddMoreFormatting, getArticleEnd, getArticleStart, getAverageCharTolerance, getCurrentPageNo, getDropThreshold, getEndBookmark, getCharactersByArticle, getIndentThreshold, getLineSeparator, getListItemPatterns, getOutput, getPageEnd, getPageStart, getParagraphEnd, getParagraphStart, getSeparateByBeads, getSortByPosition, getSpacingTolerance, getStartBookmark, getSuppressDuplicateOverlappingText, getText, getWordSeparator, matchPattern, processPages, setAddMoreFormatting, setArticleEnd, setArticleStart, setAverageCharTolerance, setDropThreshold, setEndBookmark, setIndentThreshold, setLineSeparator, setListItemPatterns, setPageEnd, setPageStart, setParagraphEnd, setParagraphStart, setShouldSeparateByBeads, setSortByPosition, setSpacingTolerance, setStartBookmark, setSuppressDuplicateOverlappingText, setWordSeparator, startArticle, startArticle, startDocument, startPage, writeCharacters, writeLineSeparator, writePage, writePageEnd, writePageStart, writeParagraphEnd, writeParagraphSeparator, writeParagraphStart, writeString, writeString, writeText, writeWordSeparator
addOperator, applyTextAdjustment, beginMarkedContentSequence, beginText, endMarkedContentSequence, endText, getAppearance, getCurrentPage, getGraphicsStackSize, getGraphicsState, getInitialMatrix, getResources, getTextLineMatrix, getTextMatrix, operatorException, processAnnotation, processChildStream, processOperator, processSoftMask, processTilingPattern, processTilingPattern, processTransparencyGroup, processType3Stream, registerOperatorProcessor, restoreGraphicsStack, restoreGraphicsState, saveGraphicsStack, saveGraphicsState, setLineDashPattern, setTextLineMatrix, setTextMatrix, showAnnotation, showFontGlyph, showForm, showText, showTextString, showTextStrings, showTransparencyGroup, showType3Glyph, transformedPoint, transformWidth, unsupportedOperator
public static final String UNIT
protected static String[] cssFontFamily
protected static String[] pdFontType
protected static String[] cssFontWeight
pdFontTypeprotected static String[] cssFontStyle
pdFontTypeprotected boolean disableGraphics
true, the graphics in the PDF file will be ignored.protected boolean disableImages
true, the embedded images will be ignored.protected boolean disableImageData
true, the image data will not be transferred to the HTML data: url.protected int startPage
protected int endPage
protected FontTable fontTable
protected org.apache.pdfbox.pdmodel.PDPage pdpage
protected float cur_x
protected float cur_y
protected float path_x
protected float path_y
protected float path_start_x
protected float path_start_y
protected org.apache.pdfbox.text.TextPosition lastText
protected org.apache.pdfbox.text.TextPosition lastDia
protected StringBuilder textLine
protected TextMetrics textMetrics
protected Vector<PathSegment> graphicsPath
protected BoxStyle style
protected BoxStyle curstyle
public PDFBoxTree()
throws IOException
IOExceptionpublic void processPage(org.apache.pdfbox.pdmodel.PDPage page)
throws IOException
processPage in class org.apache.pdfbox.text.PDFTextStripperIOExceptionpublic boolean getDisableGraphics()
true when the graphics processing is disabled in the parser configuration.public void setDisableGraphics(boolean disableGraphics)
disableGraphics - when set to true, the graphics are ignored in the source file.
public boolean getDisableImages()
true when the processing of embedded images is disabled in the parser configuration.public void setDisableImages(boolean disableImages)
disableImages - when set to true the images are ignored in the source file.public boolean getDisableImageData()
true when the copying of image data is disabled in the parser configuration.public void setDisableImageData(boolean disableImageData)
disableImageData - when set to true the image data is not copied to the document tree.
The eventual img elements will have an empty src attribute.public int getStartPage()
getStartPage in class org.apache.pdfbox.text.PDFTextStripperpublic void setStartPage(int startPage)
setStartPage in class org.apache.pdfbox.text.PDFTextStripperpublic int getEndPage()
getEndPage in class org.apache.pdfbox.text.PDFTextStripperpublic void setEndPage(int endPage)
setEndPage in class org.apache.pdfbox.text.PDFTextStripperprotected abstract void startNewPage()
protected abstract void renderText(String data, TextMetrics metrics)
curstyle property.data - The text contents.protected abstract void renderPath(List<PathSegment> path, boolean stroke, boolean fill) throws IOException
rect - the rectangle to be rendered
stroke - should there be a stroke around?
fill - should the rectangle be filled?
IOException
protected abstract void renderImage(float x,
float y,
float width,
float height,
ImageResource data)
throws IOException
type - the image type: "png" or "jpeg"
x - the X coordinate of the image
y - the Y coordinate of the image
width - the width of the image
height - the height of the image
data - the image data depending on the specified type
IOException
protected float[] toRectangle(List<PathSegment> path)
protected void updateFontTable()
protected void processOperator(org.apache.pdfbox.contentstream.operator.Operator operator,
List<org.apache.pdfbox.cos.COSBase> arguments)
throws IOException
processOperator in class org.apache.pdfbox.contentstream.PDFStreamEngineIOExceptionprotected void processImageOperation(List<org.apache.pdfbox.cos.COSBase> arguments) throws IOException
IOExceptionprotected void processTextPosition(org.apache.pdfbox.text.TextPosition text)
processTextPosition in class org.apache.pdfbox.text.PDFTextStripperprotected void finishBox()
protected boolean isReversed(byte directionality)
directionality - the Character.directionalityprotected void updateStyle(BoxStyle bstyle, org.apache.pdfbox.text.TextPosition text)
bstyle - the style to be updatedtext - the text positionprotected org.apache.pdfbox.pdmodel.common.PDRectangle getCurrentMediaBox()
protected float transformLength(float w)
protected float[] transformPosition(float x,
float y)
x - y - protected AffineTransform createCurrentPageTransformation()
protected int intValue(org.apache.pdfbox.cos.COSBase value)
value - the PDF value of the Integer or Float type
protected float floatValue(org.apache.pdfbox.cos.COSBase value)
value - the PDF value of the Integer or Float typeprotected float getLength(org.apache.pdfbox.cos.COSBase value)
value - the PDF value of the Integer or Float type
protected String stringValue(org.apache.pdfbox.cos.COSBase value)
value - the PDF value of the String, Integer or Float typeprotected String colorString(int ir, int ig, int ib)
ir - red value (0..255)ig - green value (0..255)ib - blue value (0..255)protected String colorString(float r, float g, float b)
r - red value (0..1)g - green value (0..1)b - blue value (0..1)protected String colorString(org.apache.pdfbox.pdmodel.graphics.color.PDColor pdcolor)
pdcolor - protected String getTitle()
protected byte getTextDirectionality(org.apache.pdfbox.text.TextPosition text)
protected byte getTextDirectionality(String s)
protected void showGlyph(org.apache.pdfbox.util.Matrix arg0,
org.apache.pdfbox.pdmodel.font.PDFont arg1,
int arg2,
String arg3,
org.apache.pdfbox.util.Vector arg4)
throws IOException
showGlyph in class org.apache.pdfbox.contentstream.PDFStreamEngine
IOException
Copyright © 2019. All rights reserved.