common/org.gnit.lucenekmp.analysis.email/UAX29URLEmailTokenizer

UAX29URLEmailTokenizer

class UAX29URLEmailTokenizer : Tokenizer

This class implements the Word Break rules from the Unicode Text Segmentation algorithm, as specified in Unicode Standard Annex #29. URLs and email addresses are also tokenized according to the relevant RFCs.

Constructors

constructor()

Creates a new instance of the UAX29URLEmailTokenizer. Attaches the input to the newly created JFlex scanner.

constructor(factory: AttributeFactory)

Creates a new UAX29URLEmailTokenizer with the given AttributeFactory.

Types

Companion

object Companion

Properties

attributeClassesIterator

val attributeClassesIterator: Iterator<Any>

attributeFactory

val attributeFactory: AttributeFactory

attributeImplsIterator

val attributeImplsIterator: Iterator<AttributeImpl>

Functions

addAttribute

fun <T : Attribute> addAttribute(attClass: KClass<T>): T

addAttributeImpl

fun addAttributeImpl(att: AttributeImpl)

captureState

fun captureState(): AttributeSource.State?

clearAttributes

fun clearAttributes()

cloneAttributes

fun cloneAttributes(): AttributeSource

close

open override fun close()

copyTo

fun copyTo(target: AttributeSource)

end

open override fun end()

endAttributes

fun endAttributes()

equals

open operator override fun equals(obj: Any?): Boolean

getAttribute

fun <T : Attribute> getAttribute(attClass: KClass<T>): T?

getMaxTokenLength

fun getMaxTokenLength(): Int

hasAttribute

fun hasAttribute(attClass: KClass<out Attribute>): Boolean

hasAttributes

fun hasAttributes(): Boolean

hashCode

open override fun hashCode(): Int

incrementToken

open override fun incrementToken(): Boolean

reflectAsString

fun reflectAsString(prependAttClass: Boolean): String

reflectWith

fun reflectWith(reflector: AttributeReflector)

removeAllAttributes

fun removeAllAttributes()

reset

open override fun reset()

restoreState

fun restoreState(state: AttributeSource.State?)

setMaxTokenLength

fun setMaxTokenLength(length: Int)

Sets the maximum allowed token length. Tokens longer than this are split at this length and emitted as multiple tokens. To skip such long tokens instead, increase this maximum length and then use LengthFilter to remove the long tokens. The default is UAX29URLEmailAnalyzer.DEFAULT_MAX_TOKEN_LENGTH.

setReader

fun setReader(input: Reader)

toString

open override fun toString(): String