-
- All Implemented Interfaces:
-
kotlin.Comparable
public final class KWebPage implements Comparable<KWebPage>
The core data structure across the whole program execution
Notice: Use a built-in Java string or a Utf8 to serialize strings?
See org.apache.gora.hbase.util.HBaseByteInterface#fromBytes
In the serialization phase, a byte array created by s.getBytes(UTF8_CHARSET) is serialized, and in the deserialization phase, every string is wrapped in a Utf8.
So both a built-in string and a Utf8 wrapper are OK to serialize, and a Utf8 is always returned.
-
-
Nested Class Summary
Nested Classes Modifier and Type Class Description public class KWebPage.Companion
-
Field Summary
-
Method Summary
-
-
Method Detail
-
getUrl
final String getUrl()
The url is the permanent internal address, and the location is the last working address
-
getReversedUrl
final String getReversedUrl()
The reversed URL of the web page; it is also the key of this object in the underlying storage.
-
setReversedUrl
final Unit setReversedUrl(String reversedUrl)
-
getVolatileConfig
final VolatileConfig getVolatileConfig()
Web page scope configuration
-
setVolatileConfig
final Unit setVolatileConfig(VolatileConfig volatileConfig)
Web page scope configuration
-
getVariables
final Variables getVariables()
Web page scope variables. TODO: we may use it as a PageDatum to track all context scope variables.
-
getIsNotNil
final Boolean getIsNotNil()
-
getIsInternal
final Boolean getIsInternal()
-
getIsNotInternal
final Boolean getIsNotInternal()
-
getMetadata
final Metadata getMetadata()
-
getMarks
final CrawlMarks getMarks()
-
getOptions
final String getOptions()
-
setOptions
final Unit setOptions(String options)
-
getConfiguredUrl
final String getConfiguredUrl()
-
getBatchId
final String getBatchId()
-
setBatchId
final Unit setBatchId(String batchId)
-
getDistance
final Integer getDistance()
-
setDistance
final Unit setDistance(Integer distance)
-
getFetchMode
final FetchMode getFetchMode()
-
setFetchMode
final Unit setFetchMode(FetchMode fetchMode)
-
getLastBrowser
final BrowserType getLastBrowser()
-
setLastBrowser
final Unit setLastBrowser(BrowserType lastBrowser)
-
getHtmlIntegrity
final HtmlIntegrity getHtmlIntegrity()
-
setHtmlIntegrity
final Unit setHtmlIntegrity(HtmlIntegrity htmlIntegrity)
-
getFetchPriority
final Integer getFetchPriority()
-
setFetchPriority
final Unit setFetchPriority(Integer fetchPriority)
-
getCreateTime
final Instant getCreateTime()
-
setCreateTime
final Unit setCreateTime(Instant createTime)
-
getGenerateTime
final Instant getGenerateTime()
-
setGenerateTime
final Unit setGenerateTime(Instant generateTime)
-
getFetchCount
final Integer getFetchCount()
-
setFetchCount
final Unit setFetchCount(Integer fetchCount)
-
getCrawlStatus
final CrawlStatus getCrawlStatus()
-
setCrawlStatus
final Unit setCrawlStatus(CrawlStatus crawlStatus)
-
getBaseUrl
final String getBaseUrl()
-
setBaseUrl
final Unit setBaseUrl(String baseUrl)
-
getLocation
final String getLocation()
-
getFetchTime
final Instant getFetchTime()
-
setFetchTime
final Unit setFetchTime(Instant fetchTime)
-
getPrevFetchTime
final Instant getPrevFetchTime()
-
setPrevFetchTime
final Unit setPrevFetchTime(Instant prevFetchTime)
-
getPrevCrawlTime1
final Instant getPrevCrawlTime1()
-
setPrevCrawlTime1
final Unit setPrevCrawlTime1(Instant prevCrawlTime1)
-
getFetchInterval
final Duration getFetchInterval()
-
setFetchInterval
final Unit setFetchInterval(Duration fetchInterval)
-
getProtocolStatus
final ProtocolStatus getProtocolStatus()
-
setProtocolStatus
final Unit setProtocolStatus(ProtocolStatus protocolStatus)
-
getContentType
final String getContentType()
-
setContentType
final Unit setContentType(String contentType)
-
getContent
final ByteBuffer getContent()
-
setContent
final Unit setContent(ByteBuffer content)
-
getSignature
final ByteBuffer getSignature()
-
setSignature
final Unit setSignature(ByteBuffer signature)
-
getPrevSignature
final ByteBuffer getPrevSignature()
-
setPrevSignature
final Unit setPrevSignature(ByteBuffer prevSignature)
-
getSignatureAsString
final String getSignatureAsString()
-
getPrevSignatureAsString
final String getPrevSignatureAsString()
-
getPageTitle
final String getPageTitle()
-
setPageTitle
final Unit setPageTitle(String pageTitle)
-
getContentTitle
final String getContentTitle()
-
setContentTitle
final Unit setContentTitle(String contentTitle)
-
getParseStatus
final ParseStatus getParseStatus()
-
setParseStatus
final Unit setParseStatus(ParseStatus parseStatus)
-
getLiveLinks
final Map<CharSequence, GHypeLink> getLiveLinks()
-
setLiveLinks
final Unit setLiveLinks(Map<CharSequence, GHypeLink> liveLinks)
-
getVividLinks
final Map<CharSequence, CharSequence> getVividLinks()
-
setVividLinks
final Unit setVividLinks(Map<CharSequence, CharSequence> vividLinks)
-
getDeadLinks
final List<CharSequence> getDeadLinks()
-
setDeadLinks
final Unit setDeadLinks(List<CharSequence> deadLinks)
-
getLinks
final List<CharSequence> getLinks()
-
setLinks
final Unit setLinks(List<CharSequence> links)
-
getInlinkAnchors
final Array<String> getInlinkAnchors()
-
setInlinkAnchors
final Unit setInlinkAnchors(Array<String> inlinkAnchors)
-
getAnchorOrder
final Integer getAnchorOrder()
-
setAnchorOrder
final Unit setAnchorOrder(Integer anchorOrder)
-
getContentPublishTime
final Instant getContentPublishTime()
-
setContentPublishTime
final Unit setContentPublishTime(Instant contentPublishTime)
-
getPrevContentPublishTime
final Instant getPrevContentPublishTime()
-
setPrevContentPublishTime
final Unit setPrevContentPublishTime(Instant prevContentPublishTime)
-
getRefContentPublishTime
final Instant getRefContentPublishTime()
-
setRefContentPublishTime
final Unit setRefContentPublishTime(Instant refContentPublishTime)
-
getContentModifiedTime
final Instant getContentModifiedTime()
-
setContentModifiedTime
final Unit setContentModifiedTime(Instant contentModifiedTime)
-
getPrevContentModifiedTime
final Instant getPrevContentModifiedTime()
-
setPrevContentModifiedTime
final Unit setPrevContentModifiedTime(Instant prevContentModifiedTime)
-
getPrevRefContentPublishTime
final Instant getPrevRefContentPublishTime()
-
setPrevRefContentPublishTime
final Unit setPrevRefContentPublishTime(Instant prevRefContentPublishTime)
-
getReferrer
final String getReferrer()
-
setReferrer
final Unit setReferrer(String referrer)
-
getPageModel
final PageModel getPageModel()
-
getContentScore
final Float getContentScore()
-
setContentScore
final Unit setContentScore(Float contentScore)
-
getSortScore
final String getSortScore()
-
setSortScore
final Unit setSortScore(String sortScore)
-
getPageCounters
final PageCounters getPageCounters()
-
getAndRemoveVar
final Boolean getAndRemoveVar(String name)
-
unmarkSeed
final Unit unmarkSeed()
-
updateDistance
final Unit updateDistance(Integer newDistance)
-
sniffFetchPriority
final Integer sniffFetchPriority()
-
increaseFetchCount
final Unit increaseFetchCount()
-
setCrawlStatus
final Unit setCrawlStatus(Integer value)
Set crawl status
-
getLastFetchTime
final Instant getLastFetchTime(Instant now)
Get last fetch time
If fetchTime is before now, the result is fetchTime. If fetchTime is after now, it means that the schedule has modified it for the next fetch, and the result is prevFetchTime.
-
getHeaders
final ProtocolHeaders getHeaders()
Header information returned from the web server that served the fetched content. This includes keys such as TRANSFER_ENCODING, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_LENGTH, CONTENT_LOCATION, CONTENT_DISPOSITION, CONTENT_MD5, CONTENT_TYPE, LAST_MODIFIED and LOCATION.
-
getReprUrl
final String getReprUrl()
-
setReprUrl
final Unit setReprUrl(String value)
-
getFetchRetries
final Integer getFetchRetries()
Get the number of crawl scope retries
-
setFetchRetries
final Unit setFetchRetries(Integer value)
Set the number of crawl scope retries
-
getLastTimeout
final Duration getLastTimeout()
-
getModifiedTime
final Instant getModifiedTime()
-
setModifiedTime
final Unit setModifiedTime(Instant value)
-
getPrevModifiedTime
final Instant getPrevModifiedTime()
-
setPrevModifiedTime
final Unit setPrevModifiedTime(Instant value)
-
sniffModifiedTime
final Instant sniffModifiedTime()
-
getFetchTimeHistory
final String getFetchTimeHistory(String defaultValue)
-
putFetchTimeHistory
final Unit putFetchTimeHistory(Instant fetchTime)
Parsing
-
getFirstCrawlTime
final Instant getFirstCrawlTime(Instant defaultValue)
-
getPageCategory
final PageCategory getPageCategory()
-
setPageCategory
final Unit setPageCategory(PageCategory pageCategory)
Category: index, detail, review, media, search, etc.
-
getEncoding
final String getEncoding()
-
setEncoding
final Unit setEncoding(String encoding)
-
getEncodingOrDefault
final String getEncodingOrDefault(String defaultEncoding)
Get the content encoding. The content encoding is detected just before it's parsed.
-
getEncodingClues
final String getEncodingClues()
-
setEncodingClues
final Unit setEncodingClues(String clues)
-
hasContent
final Boolean hasContent()
-
setContent
final Unit setContent(String value)
-
setContent
final Unit setContent(ByteArray value)
-
getContentAsBytes
final ByteArray getContentAsBytes()
-
getContentAsString
final String getContentAsString()
TODO: Encoding is always UTF-8?
-
getContentAsInputStream
final ByteArrayInputStream getContentAsInputStream()
-
getContentAsSaxInputSource
final InputSource getContentAsSaxInputSource()
-
getContentBytes
final Integer getContentBytes()
-
getAveContentBytes
final Integer getAveContentBytes()
-
getActiveDomMultiStatus
final ActiveDomMultiStatus getActiveDomMultiStatus()
-
setActiveDomMultiStatus
final Unit setActiveDomMultiStatus(ActiveDomMultiStatus domStatus)
-
getActiveDomUrls
final ActiveDomUrls getActiveDomUrls()
-
setActiveDomUrls
final Unit setActiveDomUrls(ActiveDomUrls urls)
-
setSignature
final Unit setSignature(ByteArray value)
-
sniffTitle
final String sniffTitle()
-
getPageText
final String getPageText()
-
setPageText
final Unit setPageText(String value)
-
getContentText
final String getContentText()
-
setContentText
final Unit setContentText(String textContent)
-
getContentTextLen
final Integer getContentTextLen()
-
setTextCascaded
final Unit setTextCascaded(String text)
Set all text fields cascaded, including content, content text and page text.
-
getSimpleLiveLinks
final Collection<String> getSimpleLiveLinks()
-
setLiveLinks
final Unit setLiveLinks(Iterable<HyperlinkPersistable> liveLinks)
TODO: Remove redundant url to reduce space
-
addLiveLink
final Unit addLiveLink(HyperlinkPersistable hyperlink)
-
getSimpleVividLinks
final Collection<String> getSimpleVividLinks()
-
addHyperlinks
final Unit addHyperlinks(Iterable<HyperlinkPersistable> hyperLinks)
Record all links that appear in a page. The links are in FIFO order: each time we fetch and parse a page, we push newly discovered links to the queue, and if the queue is full, we drop some old ones, which usually do not appear in the page any more.
TODO: compress links. TODO: HBase seems not to modify any nested array.
-
addLinks
final Unit addLinks(Iterable<CharSequence> hypeLinks)
-
getImpreciseLinkCount
final Integer getImpreciseLinkCount()
-
setImpreciseLinkCount
final Unit setImpreciseLinkCount(Integer count)
-
increaseImpreciseLinkCount
final Unit increaseImpreciseLinkCount(Integer count)
-
getInlinks
final Map<CharSequence, CharSequence> getInlinks()
-
updateContentPublishTime
final Boolean updateContentPublishTime(Instant newPublishTime)
-
updateContentModifiedTime
final Boolean updateContentModifiedTime(Instant newModifiedTime)
-
updateRefContentPublishTime
final Boolean updateRefContentPublishTime(Instant newRefPublishTime)
-
getIndexTimeHistory
final String getIndexTimeHistory(String defaultValue)
Index
-
putIndexTimeHistory
final Unit putIndexTimeHistory(Instant indexTime)
-
getFirstIndexTime
final Instant getFirstIndexTime(Instant defaultValue)
-
-
-
-