mirror of
https://github.com/crazy-max/diun.git
synced 2025-01-12 11:38:11 +00:00
276 lines
7.7 KiB
Go
276 lines
7.7 KiB
Go
package uniseg
|
||
|
||
import "unicode/utf8"
|
||
|
||
// The states of the sentence break parser. sbTransitions combines a state with
// a property value into a single 64-bit lookup key (state in the lower 32 bits,
// property in the upper 32 bits), so these must remain small, non-negative,
// untyped constants.
const (
	// sbAny is the wildcard state: it carries no context and is also used to
	// look up transitions that only depend on the next property.
	sbAny = iota
	// sbCR: the last code point was a CR which may still be followed by LF
	// (SB3).
	sbCR
	// sbParaSep: a paragraph separator (Sep, LF, or CR LF) was consumed (SB4).
	sbParaSep
	// sbATerm: an ATerm was consumed.
	sbATerm
	// sbUpper: an Upper was consumed.
	sbUpper
	// sbLower: a Lower was consumed.
	sbLower
	// sbSB7: (Upper | Lower) ATerm was consumed (left side of SB7).
	sbSB7
	// sbSB8Close: ATerm Close* was consumed.
	sbSB8Close
	// sbSB8Sp: ATerm Close* Sp* was consumed.
	sbSB8Sp
	// sbSTerm: an STerm was consumed.
	sbSTerm
	// sbSB8aClose: STerm Close* was consumed.
	sbSB8aClose
	// sbSB8aSp: STerm Close* Sp* was consumed.
	sbSB8aSp
)
|
||
|
||
// sbTransitions implements the sentence break parser's state transitions. It's
|
||
// anologous to [grTransitions], see comments there for details.
|
||
//
|
||
// Unicode version 15.0.0.
|
||
func sbTransitions(state, prop int) (newState int, sentenceBreak bool, rule int) {
|
||
switch uint64(state) | uint64(prop)<<32 {
|
||
// SB3.
|
||
case sbAny | prCR<<32:
|
||
return sbCR, false, 9990
|
||
case sbCR | prLF<<32:
|
||
return sbParaSep, false, 30
|
||
|
||
// SB4.
|
||
case sbAny | prSep<<32:
|
||
return sbParaSep, false, 9990
|
||
case sbAny | prLF<<32:
|
||
return sbParaSep, false, 9990
|
||
case sbParaSep | prAny<<32:
|
||
return sbAny, true, 40
|
||
case sbCR | prAny<<32:
|
||
return sbAny, true, 40
|
||
|
||
// SB6.
|
||
case sbAny | prATerm<<32:
|
||
return sbATerm, false, 9990
|
||
case sbATerm | prNumeric<<32:
|
||
return sbAny, false, 60
|
||
case sbSB7 | prNumeric<<32:
|
||
return sbAny, false, 60 // Because ATerm also appears in SB7.
|
||
|
||
// SB7.
|
||
case sbAny | prUpper<<32:
|
||
return sbUpper, false, 9990
|
||
case sbAny | prLower<<32:
|
||
return sbLower, false, 9990
|
||
case sbUpper | prATerm<<32:
|
||
return sbSB7, false, 70
|
||
case sbLower | prATerm<<32:
|
||
return sbSB7, false, 70
|
||
case sbSB7 | prUpper<<32:
|
||
return sbUpper, false, 70
|
||
|
||
// SB8a.
|
||
case sbAny | prSTerm<<32:
|
||
return sbSTerm, false, 9990
|
||
case sbATerm | prSContinue<<32:
|
||
return sbAny, false, 81
|
||
case sbATerm | prATerm<<32:
|
||
return sbATerm, false, 81
|
||
case sbATerm | prSTerm<<32:
|
||
return sbSTerm, false, 81
|
||
case sbSB7 | prSContinue<<32:
|
||
return sbAny, false, 81
|
||
case sbSB7 | prATerm<<32:
|
||
return sbATerm, false, 81
|
||
case sbSB7 | prSTerm<<32:
|
||
return sbSTerm, false, 81
|
||
case sbSB8Close | prSContinue<<32:
|
||
return sbAny, false, 81
|
||
case sbSB8Close | prATerm<<32:
|
||
return sbATerm, false, 81
|
||
case sbSB8Close | prSTerm<<32:
|
||
return sbSTerm, false, 81
|
||
case sbSB8Sp | prSContinue<<32:
|
||
return sbAny, false, 81
|
||
case sbSB8Sp | prATerm<<32:
|
||
return sbATerm, false, 81
|
||
case sbSB8Sp | prSTerm<<32:
|
||
return sbSTerm, false, 81
|
||
case sbSTerm | prSContinue<<32:
|
||
return sbAny, false, 81
|
||
case sbSTerm | prATerm<<32:
|
||
return sbATerm, false, 81
|
||
case sbSTerm | prSTerm<<32:
|
||
return sbSTerm, false, 81
|
||
case sbSB8aClose | prSContinue<<32:
|
||
return sbAny, false, 81
|
||
case sbSB8aClose | prATerm<<32:
|
||
return sbATerm, false, 81
|
||
case sbSB8aClose | prSTerm<<32:
|
||
return sbSTerm, false, 81
|
||
case sbSB8aSp | prSContinue<<32:
|
||
return sbAny, false, 81
|
||
case sbSB8aSp | prATerm<<32:
|
||
return sbATerm, false, 81
|
||
case sbSB8aSp | prSTerm<<32:
|
||
return sbSTerm, false, 81
|
||
|
||
// SB9.
|
||
case sbATerm | prClose<<32:
|
||
return sbSB8Close, false, 90
|
||
case sbSB7 | prClose<<32:
|
||
return sbSB8Close, false, 90
|
||
case sbSB8Close | prClose<<32:
|
||
return sbSB8Close, false, 90
|
||
case sbATerm | prSp<<32:
|
||
return sbSB8Sp, false, 90
|
||
case sbSB7 | prSp<<32:
|
||
return sbSB8Sp, false, 90
|
||
case sbSB8Close | prSp<<32:
|
||
return sbSB8Sp, false, 90
|
||
case sbSTerm | prClose<<32:
|
||
return sbSB8aClose, false, 90
|
||
case sbSB8aClose | prClose<<32:
|
||
return sbSB8aClose, false, 90
|
||
case sbSTerm | prSp<<32:
|
||
return sbSB8aSp, false, 90
|
||
case sbSB8aClose | prSp<<32:
|
||
return sbSB8aSp, false, 90
|
||
case sbATerm | prSep<<32:
|
||
return sbParaSep, false, 90
|
||
case sbATerm | prCR<<32:
|
||
return sbParaSep, false, 90
|
||
case sbATerm | prLF<<32:
|
||
return sbParaSep, false, 90
|
||
case sbSB7 | prSep<<32:
|
||
return sbParaSep, false, 90
|
||
case sbSB7 | prCR<<32:
|
||
return sbParaSep, false, 90
|
||
case sbSB7 | prLF<<32:
|
||
return sbParaSep, false, 90
|
||
case sbSB8Close | prSep<<32:
|
||
return sbParaSep, false, 90
|
||
case sbSB8Close | prCR<<32:
|
||
return sbParaSep, false, 90
|
||
case sbSB8Close | prLF<<32:
|
||
return sbParaSep, false, 90
|
||
case sbSTerm | prSep<<32:
|
||
return sbParaSep, false, 90
|
||
case sbSTerm | prCR<<32:
|
||
return sbParaSep, false, 90
|
||
case sbSTerm | prLF<<32:
|
||
return sbParaSep, false, 90
|
||
case sbSB8aClose | prSep<<32:
|
||
return sbParaSep, false, 90
|
||
case sbSB8aClose | prCR<<32:
|
||
return sbParaSep, false, 90
|
||
case sbSB8aClose | prLF<<32:
|
||
return sbParaSep, false, 90
|
||
|
||
// SB10.
|
||
case sbSB8Sp | prSp<<32:
|
||
return sbSB8Sp, false, 100
|
||
case sbSB8aSp | prSp<<32:
|
||
return sbSB8aSp, false, 100
|
||
case sbSB8Sp | prSep<<32:
|
||
return sbParaSep, false, 100
|
||
case sbSB8Sp | prCR<<32:
|
||
return sbParaSep, false, 100
|
||
case sbSB8Sp | prLF<<32:
|
||
return sbParaSep, false, 100
|
||
|
||
// SB11.
|
||
case sbATerm | prAny<<32:
|
||
return sbAny, true, 110
|
||
case sbSB7 | prAny<<32:
|
||
return sbAny, true, 110
|
||
case sbSB8Close | prAny<<32:
|
||
return sbAny, true, 110
|
||
case sbSB8Sp | prAny<<32:
|
||
return sbAny, true, 110
|
||
case sbSTerm | prAny<<32:
|
||
return sbAny, true, 110
|
||
case sbSB8aClose | prAny<<32:
|
||
return sbAny, true, 110
|
||
case sbSB8aSp | prAny<<32:
|
||
return sbAny, true, 110
|
||
// We'll always break after ParaSep due to SB4.
|
||
|
||
default:
|
||
return -1, false, -1
|
||
}
|
||
}
|
||
|
||
// transitionSentenceBreakState determines the new state of the sentence break
|
||
// parser given the current state and the next code point. It also returns
|
||
// whether a sentence boundary was detected. If more than one code point is
|
||
// needed to determine the new state, the byte slice or the string starting
|
||
// after rune "r" can be used (whichever is not nil or empty) for further
|
||
// lookups.
|
||
func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
|
||
// Determine the property of the next character.
|
||
nextProperty := property(sentenceBreakCodePoints, r)
|
||
|
||
// SB5 (Replacing Ignore Rules).
|
||
if nextProperty == prExtend || nextProperty == prFormat {
|
||
if state == sbParaSep || state == sbCR {
|
||
return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
|
||
}
|
||
if state < 0 {
|
||
return sbAny, true // SB1.
|
||
}
|
||
return state, false
|
||
}
|
||
|
||
// Find the applicable transition in the table.
|
||
var rule int
|
||
newState, sentenceBreak, rule = sbTransitions(state, nextProperty)
|
||
if newState < 0 {
|
||
// No specific transition found. Try the less specific ones.
|
||
anyPropState, anyPropProp, anyPropRule := sbTransitions(state, prAny)
|
||
anyStateState, anyStateProp, anyStateRule := sbTransitions(sbAny, nextProperty)
|
||
if anyPropState >= 0 && anyStateState >= 0 {
|
||
// Both apply. We'll use a mix (see comments for grTransitions).
|
||
newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
|
||
if anyPropRule < anyStateRule {
|
||
sentenceBreak, rule = anyPropProp, anyPropRule
|
||
}
|
||
} else if anyPropState >= 0 {
|
||
// We only have a specific state.
|
||
newState, sentenceBreak, rule = anyPropState, anyPropProp, anyPropRule
|
||
// This branch will probably never be reached because okAnyState will
|
||
// always be true given the current transition map. But we keep it here
|
||
// for future modifications to the transition map where this may not be
|
||
// true anymore.
|
||
} else if anyStateState >= 0 {
|
||
// We only have a specific property.
|
||
newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
|
||
} else {
|
||
// No known transition. SB999: Any × Any.
|
||
newState, sentenceBreak, rule = sbAny, false, 9990
|
||
}
|
||
}
|
||
|
||
// SB8.
|
||
if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
|
||
// Check the right side of the rule.
|
||
var length int
|
||
for nextProperty != prOLetter &&
|
||
nextProperty != prUpper &&
|
||
nextProperty != prLower &&
|
||
nextProperty != prSep &&
|
||
nextProperty != prCR &&
|
||
nextProperty != prLF &&
|
||
nextProperty != prATerm &&
|
||
nextProperty != prSTerm {
|
||
// Move on to the next rune.
|
||
if b != nil { // Byte slice version.
|
||
r, length = utf8.DecodeRune(b)
|
||
b = b[length:]
|
||
} else { // String version.
|
||
r, length = utf8.DecodeRuneInString(str)
|
||
str = str[length:]
|
||
}
|
||
if r == utf8.RuneError {
|
||
break
|
||
}
|
||
nextProperty = property(sentenceBreakCodePoints, r)
|
||
}
|
||
if nextProperty == prLower {
|
||
return sbLower, false
|
||
}
|
||
}
|
||
|
||
return
|
||
}
|