core/org.gnit.lucenekmp.util.automaton/RegExp

RegExp

Regular Expression extension to Automaton.

Regular expressions are built from the following abstract syntax:

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

description of regular expression grammar
regexp	::=	unionexp
	\|
unionexp	::=	interexp `\|` unionexp	(union)
	\|	interexp
interexp	::=	concatexp `&` interexp	(intersection)	[OPTIONAL]
	\|	concatexp
concatexp	::=	repeatexp concatexp	(concatenation)
	\|	repeatexp
repeatexp	::=	repeatexp `?`	(zero or one occurrence)
	\|	repeatexp `*`	(zero or more occurrences)
	\|	repeatexp `+`	(one or more occurrences)
	\|	repeatexp `{n}`	(`n` occurrences)
	\|	repeatexp `{n,}`	(`n` or more occurrences)
	\|	repeatexp `{n,m}`	(`n` to `m` occurrences, including both)
	\|	complexp
charclassexp	::=	`[` charclasses `]`	(character class)
	\|	`[^` charclasses `]`	(negated character class)
	\|	simpleexp
charclasses	::=	charclass charclasses
	\|	charclass
charclass	::=	charexp `-` charexp	(character range, including end-points)
	\|	charexp
simpleexp	::=	charexp
	\|	`.`	(any single character)
	\|	`#`	(the empty language)	[OPTIONAL]
	\|	`@`	(any string)	[OPTIONAL]
	\|	`"` <Unicode string without double-quotes> `"`	(a string)
	\|	`(` `)`	(the empty string)
	\|	`(` unionexp `)`	(precedence override)
	\|	`<` <identifier> `>`	(named automaton)	[OPTIONAL]
	\|	`<n-m>`	(numerical interval)	[OPTIONAL]
charexp	::=	<Unicode character>	(a single non-reserved character)
	\|	`\d`	(a digit [0-9])
	\|	`\D`	(a non-digit [^0-9])
	\|	`\s`	(whitespace [ \t\n\r])
	\|	`\S`	(non whitespace [^\s])
	\|	`\w`	(a word character [a-zA-Z_0-9])
	\|	`\W`	(a non word character [^\w])
	\|	`\` <Unicode character>	(a single character)

The productions marked OPTIONAL are only allowed if specified by the syntax flags passed to the RegExp constructor. The reserved characters used in the (enabled) syntax must be escaped with backslash (**\**) or double-quotes (**"..."**). (In contrast to other regexp syntaxes, this is also required in character classes.) Be aware that dash (**-**) has a special meaning in charclass expressions. An identifier is a string not containing right angle bracket (**>**) or dash (**-**). Numerical intervals are specified by non-negative decimal integers and include both end points, and if *n* and *m* have the same number of digits, then the conforming strings must have that length (i.e. prefixed by 0's).

Constructors

RegExp

@JvmOverloads

constructor(s: String, syntax_flags: Int = ALL, match_flags: Int = 0)

Constructs new RegExp from a string. Same as RegExp(s, ALL).

Types

Companion

object Companion

Kind

enum Kind : Enum<RegExp.Kind>

The type of expression represented by a RegExp node.

Properties

val c: Int

Character expression

digits

val digits: Int

exp1Val

val exp1Val: RegExp?

Child expressions held by a container type expression

exp2Val

val exp2Val: RegExp?

flags

val flags: Int

from

val from: IntArray?

Extents for range type expressions

identifiers

val identifiers: MutableSet<String?>

kind

val kind: RegExp.Kind

The type of expression

max

val max: Int

min

val min: Int

Limits for repeatable type expressions

originalString

val originalString: String?

The string that was used to construct the regex. Compare to toString.

pos

var pos: Int

sVal

val sVal: String?

String expression

val to: IntArray?

Functions

expandPreDefined

fun expandPreDefined(starts: MutableList<Int>, ends: MutableList<Int>)

getIdentifiers

fun getIdentifiers(set: MutableSet<String?>)

iterativeParseExp

fun iterativeParseExp(gather: () -> RegExp, stop: () -> Boolean, associativeReduce: (Int, RegExp, RegExp) -> RegExp): RegExp

matchPredefinedCharacterClass

fun matchPredefinedCharacterClass(): RegExp?

parseCharClasses

fun parseCharClasses(): RegExp

parseCharClassExp

fun parseCharClassExp(): RegExp

parseCharExp

fun parseCharExp(): Int

parseComplExp

fun parseComplExp(): RegExp

parseConcatExp

fun parseConcatExp(): RegExp

parseInterExp

fun parseInterExp(): RegExp

parseRepeatExp

fun parseRepeatExp(): RegExp

parseSimpleExp

fun parseSimpleExp(): RegExp

parseUnionExp

fun parseUnionExp(): RegExp

toAutomaton

fun toAutomaton(): Automaton

Constructs new Automaton from this RegExp. Same as toAutomaton(null) (empty automaton map).

fun toAutomaton(automata: MutableMap<String?, Automaton?>?): Automaton?

fun toAutomaton(automaton_provider: AutomatonProvider?): Automaton?

Constructs new Automaton from this RegExp.

toString

open override fun toString(): String

Constructs string from parsed regular expression.

toStringBuilder

fun toStringBuilder(b: StringBuilder)

toStringTree

fun toStringTree(): String

Like toString, but more verbose (shows the hierarchy more clearly).

fun toStringTree(b: StringBuilder, indent: String?)

regexp	::=	unionexp
	\|
unionexp	::=	interexp `\|` unionexp	(union)
	\|	interexp
interexp	::=	concatexp `&` interexp	(intersection)	[OPTIONAL]
	\|	concatexp
concatexp	::=	repeatexp concatexp	(concatenation)
	\|	repeatexp
repeatexp	::=	repeatexp `?`	(zero or one occurrence)
	\|	repeatexp `*`	(zero or more occurrences)
	\|	repeatexp `+`	(one or more occurrences)
	\|	repeatexp `{n}`	(`n` occurrences)
	\|	repeatexp `{n,}`	(`n` or more occurrences)
	\|	repeatexp `{n,m}`	(`n` to `m` occurrences, including both)
	\|	complexp
charclassexp	::=	`[` charclasses `]`	(character class)
	\|	`[^` charclasses `]`	(negated character class)
	\|	simpleexp
charclasses	::=	charclass charclasses
	\|	charclass
charclass	::=	charexp `-` charexp	(character range, including end-points)
	\|	charexp
simpleexp	::=	charexp
	\|	`.`	(any single character)
	\|	`#`	(the empty language)	[OPTIONAL]
	\|	`@`	(any string)	[OPTIONAL]
	\|	`"` <Unicode string without double-quotes> `"`	(a string)
	\|	`(` `)`	(the empty string)
	\|	`(` unionexp `)`	(precedence override)
	\|	`<` <identifier> `>`	(named automaton)	[OPTIONAL]
	\|	`<n-m>`	(numerical interval)	[OPTIONAL]
charexp	::=	<Unicode character>	(a single non-reserved character)
	\|	`\d`	(a digit [0-9])
	\|	`\D`	(a non-digit [^0-9])
	\|	`\s`	(whitespace [ \t\n\r])
	\|	`\S`	(non whitespace [^\s])
	\|	`\w`	(a word character [a-zA-Z_0-9])
	\|	`\W`	(a non word character [^\w])
	\|	`\` <Unicode character>	(a single character)