Files
go_goutils/remap/funcs_remap.go
brent saner 2edbc9306d v1.15.4
FIXED:
* Docs error
2026-01-07 19:15:21 -05:00

816 lines
20 KiB
Go

package remap
/*
Map returns a map[string][]<match bytes> for regexes with named capture groups matched in bytes b.
Note that this supports non-unique group names; [regexp.Regexp] allows for patterns with multiple groups
using the same group name (though your IDE might complain; I know GoLand does).
It will panic if the embedded [regexp.Regexp] is nil.
Each match for each group is in a slice keyed under that group name, with that slice
ordered by the indexing done by the regex match itself.
This operates on only the first found match (like [regexp.Regexp.FindSubmatch]).
To operate on *all* matches, use [ReMap.MapAll].
In summary, the parameters are as follows:
# inclNoMatch
If true, then attempt to return a non-nil matches (as long as b isn't nil).
Group keys will be populated and explicitly defined as nil.
For example, if a pattern
^(?P<g1>foo)(?P<g1>bar)(?P<g2>baz)$
is provided but b does not match then matches will be:
map[string][][]byte{
"g1": nil,
"g2": nil,
}
# inclNoMatchStrict
If true (and inclNoMatch is true), instead of a single nil the group's values will be
a slice of nil values explicitly matching the number of times the group name is specified
in the pattern.
May be unpredictable if the same name is used multiple times for different capture groups across multiple patterns.
For example, if a pattern:
^(?P<g1>foo)(?P<g1>bar)(?P<g2>baz)$
is provided but b does not match then matches will be:
map[string][][]byte{
"g1": [][]byte{
nil,
nil,
},
"g2": [][]byte{
nil,
},
}
# mustMatch
If true, matches will be nil if the entirety of b does not match the pattern (and thus
no capture groups matched) (overrides inclNoMatch) -- explicitly:
matches == nil
Otherwise if false (and assuming inclNoMatch is false), matches will be:
map[string][][]byte{}
# Condition Tree
In detail, matches and/or its values may be nil or empty under the following condition tree:
IF b is nil:
THEN matches will always be nil
ELSE:
IF all of b does not match pattern
IF mustMatch is true
THEN matches == nil
ELSE
THEN matches == map[string][][]byte{} (non-nil but empty)
ELSE IF pattern has no named capture groups
IF inclNoMatch is true
THEN matches == map[string][][]byte{} (non-nil but empty)
ELSE
THEN matches == nil
ELSE
IF there are no named group matches
IF inclNoMatch is true
THEN matches is non-nil; matches[<group name>, ...] is/are defined but nil (_, ok = matches[<group name>]; ok == true)
ELSE
THEN matches == nil
ELSE
IF <group name> does not have a match
IF inclNoMatch is true
IF inclNoMatchStrict is true
THEN matches[<group name>] is defined and non-nil, but populated with placeholder nils
(matches[<group name>] == [][]byte{nil[, nil, ...]})
ELSE
THEN matches[<group name>] is guaranteed defined but may be nil (_, ok = matches[<group name>]; ok == true)
ELSE
THEN matches[<group name>] is not defined (_, ok = matches[<group name>]; ok == false)
ELSE
matches[<group name>] == []{<match>[, <match>...]}
*/
func (r *ReMap) Map(b []byte, inclNoMatch, inclNoMatchStrict, mustMatch bool) (matches map[string][][]byte) {
	// Collects the submatches of the *first* match of the embedded pattern in b,
	// keyed by capture group name. The nil/empty semantics of the result follow
	// the condition tree in the doc comment.

	var ok bool
	var mIdx int
	var hasNamed bool
	var match []byte
	var grpNm string
	var names []string
	var matchBytes [][]byte
	var tmpMap map[string][][]byte

	// A nil input never yields a map, regardless of the flags.
	if b == nil {
		return
	}

	/*
		names[0] is always "" (it stands for the whole-pattern match) and names[1:]
		lines up index-for-index with matchBytes[1:].
		DO NOT sort or dedupe names; a group name used several times in the pattern
		appears several times here, in pattern order.
	*/
	names = r.Regexp.SubexpNames()
	matchBytes = r.Regexp.FindSubmatch(b)

	if matchBytes == nil {
		// b does not match the pattern at all; mustMatch picks nil over an empty map.
		if !mustMatch {
			matches = make(map[string][][]byte)
		}
		return
	}

	// Drop the whole-pattern entry so that names only describes capture groups.
	if len(names) > 1 {
		names = names[1:]
	} else {
		names = nil
	}
	for _, grpNm = range names {
		if grpNm != "" {
			hasNamed = true
			break
		}
	}
	if !hasNamed {
		/*
			No *named* capture groups. This covers both a pattern without any groups
			and a pattern that only has unnamed groups; the documented result is the
			same for both (non-nil but empty if inclNoMatch, nil otherwise).
		*/
		if inclNoMatch {
			matches = make(map[string][][]byte)
		}
		return
	}

	if len(matchBytes) <= 1 {
		/*
			No submatches whatsoever.
			This should not be reachable (a match always carries one entry per group);
			it is a safe return before matchBytes is resliced.
		*/
		matches = make(map[string][][]byte)
		if inclNoMatch {
			for _, grpNm = range names {
				if grpNm == "" {
					continue
				}
				matches[grpNm] = nil
			}
		}
		return
	}
	matchBytes = matchBytes[1:]

	tmpMap = make(map[string][][]byte, len(names))
	for mIdx, match = range matchBytes {
		// Should never trigger; purely protects the names lookup below.
		if mIdx >= len(names) {
			break
		}
		grpNm = names[mIdx]
		/*
			A named capture group with an *empty* name does not compile,
			so grpNm == "" can only mean an *unnamed* capture group; skip it.
		*/
		if grpNm == "" {
			continue
		}
		if match == nil {
			// The pattern matched, but this particular group did not take part in it.
			if !inclNoMatch {
				continue
			}
			if _, ok = tmpMap[grpNm]; !ok {
				if inclNoMatchStrict {
					// Strict mode keeps one nil placeholder per occurrence of the group.
					tmpMap[grpNm] = [][]byte{nil}
				} else {
					// Non-strict mode only guarantees that the key is defined.
					tmpMap[grpNm] = nil
				}
			} else if inclNoMatchStrict {
				tmpMap[grpNm] = append(tmpMap[grpNm], nil)
			}
			continue
		}
		// append on a missing (or nil) entry creates the slice as needed.
		tmpMap[grpNm] = append(tmpMap[grpNm], match)
	}

	// Should already be handled above; guarantees every named group has a key.
	if inclNoMatch {
		for _, grpNm = range names {
			if grpNm == "" {
				continue
			}
			if _, ok = tmpMap[grpNm]; !ok {
				tmpMap[grpNm] = nil
			}
		}
	}

	// An empty result is reported as nil (no named group took part in the match).
	if len(tmpMap) > 0 {
		matches = tmpMap
	}

	return
}
/*
MapAll behaves exactly like [ReMap.Map] but will "squash"/consolidate *all* found matches, not just the first occurrence,
into the group name.
You likely want to use this instead of [ReMap.Map] for multiline patterns.
*/
func (r *ReMap) MapAll(b []byte, inclNoMatch, inclNoMatchStrict, mustMatch bool) (matches map[string][][]byte) {
	// Collects the submatches of *every* match of the embedded pattern in b,
	// appending them per capture group name in the order the matches were found.

	var ok bool
	var mIdx int
	var hasNamed bool
	var match []byte
	var grpNm string
	var names []string
	var mbGrp [][]byte
	var matchBytes [][][]byte
	var tmpMap map[string][][]byte

	// A nil input never yields a map, regardless of the flags.
	if b == nil {
		return
	}

	/*
		names[0] is always "" (it stands for the whole-pattern match) and names[1:]
		lines up index-for-index with each match's submatches.
		DO NOT sort or dedupe names; duplicated group names are kept in pattern order.
	*/
	names = r.Regexp.SubexpNames()
	matchBytes = r.Regexp.FindAllSubmatch(b, -1)

	if matchBytes == nil {
		// b does not match the pattern at all; mustMatch picks nil over an empty map.
		if !mustMatch {
			matches = make(map[string][][]byte)
		}
		return
	}

	// Drop the whole-pattern entry so that names only describes capture groups.
	if len(names) > 1 {
		names = names[1:]
	} else {
		names = nil
	}
	for _, grpNm = range names {
		if grpNm != "" {
			hasNamed = true
			break
		}
	}
	if !hasNamed {
		/*
			No *named* capture groups. This covers both a pattern without any groups
			and a pattern that only has unnamed groups; the result is the same for
			both (non-nil but empty if inclNoMatch, nil otherwise).
		*/
		if inclNoMatch {
			matches = make(map[string][][]byte)
		}
		return
	}

	tmpMap = make(map[string][][]byte, len(names))

	// Each mbGrp is one complete match, shaped like the result of FindSubmatch.
	for _, mbGrp = range matchBytes {
		if len(mbGrp) <= 1 {
			// Nil or no submatches at all; nothing to collect from this match.
			continue
		}
		// Skip element 0 (the whole match) so that mIdx indexes names directly.
		for mIdx, match = range mbGrp[1:] {
			// names[mIdx] is only valid for mIdx < len(names).
			if mIdx >= len(names) {
				break
			}
			grpNm = names[mIdx]
			if grpNm == "" {
				// unnamed capture group
				continue
			}
			if match == nil {
				// The pattern matched, but this particular group did not take part in it.
				if !inclNoMatch {
					continue
				}
				if _, ok = tmpMap[grpNm]; !ok {
					if inclNoMatchStrict {
						// Strict mode keeps one nil placeholder per missed occurrence.
						tmpMap[grpNm] = [][]byte{nil}
					} else {
						// Non-strict mode only guarantees that the key is defined.
						tmpMap[grpNm] = nil
					}
				} else if inclNoMatchStrict {
					tmpMap[grpNm] = append(tmpMap[grpNm], nil)
				}
				continue
			}
			// append on a missing (or nil) entry creates the slice as needed.
			tmpMap[grpNm] = append(tmpMap[grpNm], match)
		}
	}

	// Should already be handled above; guarantees every named group has a key.
	if inclNoMatch {
		for _, grpNm = range names {
			if grpNm == "" {
				continue
			}
			if _, ok = tmpMap[grpNm]; !ok {
				tmpMap[grpNm] = nil
			}
		}
	}

	// An empty result is reported as nil (no named group took part in any match).
	if len(tmpMap) > 0 {
		matches = tmpMap
	}

	return
}
/*
MapString is exactly like [ReMap.Map], but operates on (and returns) strings instead.
(matches will always be nil if s == "".)
It will panic if the embedded [regexp.Regexp] is nil.
This operates on only the first found match (like [regexp.Regexp.FindStringSubmatch]).
To operate on *all* matches, use [ReMap.MapStringAll].
A small deviation and caveat, though; empty strings instead of nils (because duh) will occupy slice placeholders (if `inclNoMatchStrict` is specified).
This unfortunately *does not provide any indication* if an empty string positively matched the pattern (a "hit") or if it was simply
not matched at all (a "miss"). If you need definitive determination between the two conditions, it is instead recommended to either
*not* use inclNoMatchStrict or to use [ReMap.Map] instead and convert any non-nil values to strings after.
Particularly:
# inclNoMatch
If true, then attempt to return a non-nil matches (as long as s isn't empty).
Group keys will be populated and explicitly defined as nil.
For example, if a pattern
^(?P<g1>foo)(?P<g1>bar)(?P<g2>baz)$
is provided but s does not match then matches will be:
map[string][]string{
"g1": nil,
"g2": nil,
}
# inclNoMatchStrict
If true (and inclNoMatch is true), instead of a single nil the group's values will be
a slice of empty string values explicitly matching the number of times the group name is specified
in the pattern.
May be unpredictable if the same name is used multiple times for different capture groups across multiple patterns.
For example, if a pattern:
^(?P<g1>foo)(?P<g1>bar)(?P<g2>baz)$
is provided but s does not match then matches will be:
map[string][]string{
"g1": []string{
"",
"",
},
"g2": []string{
"",
},
}
# mustMatch
If true, matches will be nil if the entirety of s does not match the pattern (and thus
no capture groups matched) (overrides inclNoMatch) -- explicitly:
matches == nil
Otherwise if false (and assuming inclNoMatch is false), matches will be:
map[string][]string{}
# Condition Tree
In detail, matches and/or its values may be nil or empty under the following condition tree:
IF s is empty:
THEN matches will always be nil
ELSE:
IF all of s does not match pattern
IF mustMatch is true
THEN matches == nil
ELSE
THEN matches == map[string][]string{} (non-nil but empty)
ELSE IF pattern has no named capture groups
IF inclNoMatch is true
THEN matches == map[string][]string{} (non-nil but empty)
ELSE
THEN matches == nil
ELSE
IF there are no named group matches
IF inclNoMatch is true
THEN matches is non-nil; matches[<group name>, ...] is/are defined but nil (_, ok = matches[<group name>]; ok == true)
ELSE
THEN matches == nil
ELSE
IF <group name> does not have a match
IF inclNoMatch is true
IF inclNoMatchStrict is true
THEN matches[<group name>] is defined and non-nil, but populated with placeholder strings
(matches[<group name>] == []string{""[, "", ...]})
ELSE
THEN matches[<group name>] is guaranteed defined but may be nil (_, ok = matches[<group name>]; ok == true)
ELSE
THEN matches[<group name>] is not defined (_, ok = matches[<group name>]; ok == false)
ELSE
matches[<group name>] == []{<match>[, <match>...]}
*/
func (r *ReMap) MapString(s string, inclNoMatch, inclNoMatchStrict, mustMatch bool) (matches map[string][]string) {
	// Collects the submatches of the *first* match of the embedded pattern in s,
	// keyed by capture group name.

	var ok bool
	var grpIdx int
	var hasNamed bool
	var grpNm string
	var names []string
	var matchStr string
	var si stringIndexer
	var matchIndices []int
	var tmpMap map[string][]string

	/*
		This deviates from the []byte variant: with strings there is no nil to tell
		a group that *matched an empty string* apart from a group that was
		*not matched*, so the submatch *indices* are used instead of the
		submatch strings.
	*/

	// An empty input never yields a map, regardless of the flags.
	if s == "" {
		return
	}

	/*
		DO NOT sort or dedupe names! If the same name is used for several groups
		it is repeated here in pattern order, and that order is tied to the
		ordering of matchIndices.
	*/
	names = r.Regexp.SubexpNames()
	matchIndices = r.Regexp.FindStringSubmatchIndex(s)

	if matchIndices == nil {
		// s does not match the pattern at all; mustMatch picks nil over an empty map.
		if !mustMatch {
			matches = make(map[string][]string)
		}
		return
	}

	// names[0] is always "" (the whole-pattern match); drop it.
	if len(names) > 1 {
		names = names[1:]
	} else {
		names = nil
	}
	for _, grpNm = range names {
		if grpNm != "" {
			hasNamed = true
			break
		}
	}
	if !hasNamed {
		/*
			No *named* capture groups. This covers both a pattern without any groups
			and a pattern that only has unnamed groups; the documented result is the
			same for both (non-nil but empty if inclNoMatch, nil otherwise).
		*/
		if inclNoMatch {
			matches = make(map[string][]string)
		}
		return
	}

	if len(matchIndices) <= 1 {
		/*
			No (sub)matches whatsoever.
			This should not be reachable: matchIndices is either nil or holds at
			least one <start>, <end> pair. It is a safe return before the indices
			are resliced.
		*/
		matches = make(map[string][]string)
		if inclNoMatch {
			for _, grpNm = range names {
				if grpNm == "" {
					continue
				}
				matches[grpNm] = nil
			}
		}
		return
	}

	/*
		The indices come in pairs:
			[]int{<start>, <end>, <start>, <end>, ...}
		and the first pair is the whole-pattern match. Dropping it keeps
		matchIndices aligned with the (already trimmed) names, so that group
		grpIdx owns the pair at 2*grpIdx and 2*grpIdx+1.
	*/
	matchIndices = matchIndices[2:]

	tmpMap = make(map[string][]string, len(names))

	// <end> is an upper boundary, not a position, so each pair is usable as-is in a slice expression.
	for grpIdx = 0; grpIdx < len(names) && (2*grpIdx)+1 < len(matchIndices); grpIdx++ {
		si = stringIndexer{
			group:   grpIdx,
			start:   matchIndices[2*grpIdx],
			end:     matchIndices[(2*grpIdx)+1],
			matched: true,
			nm:      names[grpIdx],
			grpS:    "",
			s:       &matchStr,
			ptrn:    r.Regexp,
		}
		if si.nm == "" {
			// unnamed capture group
			continue
		}

		// sets si.matched and si.grpS
		si.idxSlice(&s)

		if !si.matched {
			// The pattern matched, but this particular group did not take part in it.
			if !inclNoMatch {
				continue
			}
			if _, ok = tmpMap[si.nm]; !ok {
				if inclNoMatchStrict {
					// Strict mode keeps one "" placeholder per occurrence of the group.
					tmpMap[si.nm] = []string{""}
				} else {
					// Non-strict mode only guarantees that the key is defined.
					tmpMap[si.nm] = nil
				}
			} else if inclNoMatchStrict {
				tmpMap[si.nm] = append(tmpMap[si.nm], "")
			}
			continue
		}

		// append on a missing (or nil) entry creates the slice as needed.
		tmpMap[si.nm] = append(tmpMap[si.nm], si.grpS)
	}

	// Should already be handled above; guarantees every named group has a key.
	if inclNoMatch {
		for _, grpNm = range names {
			if grpNm == "" {
				continue
			}
			if _, ok = tmpMap[grpNm]; !ok {
				tmpMap[grpNm] = nil
			}
		}
	}

	// An empty result is reported as nil (no named group took part in the match).
	if len(tmpMap) > 0 {
		matches = tmpMap
	}

	return
}
/*
MapStringAll behaves exactly like [ReMap.MapString] but will "squash"/consolidate *all* found matches, not just the first occurrence,
into the group name.
You likely want to use this instead of [ReMap.MapString] for multiline patterns.
*/
func (r *ReMap) MapStringAll(s string, inclNoMatch, inclNoMatchStrict, mustMatch bool) (matches map[string][]string) {
	// Collects the submatches of *every* match of the embedded pattern in s,
	// appending them per capture group name in the order the matches were found.

	var ok bool
	var grpIdx int
	var hasNamed bool
	var grpNm string
	var names []string
	var matchStr string
	var si stringIndexer
	var matchIndices []int
	var allMatchIndices [][]int
	var tmpMap map[string][]string

	// An empty input never yields a map, regardless of the flags.
	if s == "" {
		return
	}

	// DO NOT sort or dedupe names; their order is tied to the order of the index pairs.
	names = r.Regexp.SubexpNames()
	allMatchIndices = r.Regexp.FindAllStringSubmatchIndex(s, -1)

	if allMatchIndices == nil {
		// s does not match the pattern at all; mustMatch picks nil over an empty map.
		if !mustMatch {
			matches = make(map[string][]string)
		}
		return
	}

	// names[0] is always "" (the whole-pattern match); drop it.
	if len(names) > 1 {
		names = names[1:]
	} else {
		names = nil
	}
	for _, grpNm = range names {
		if grpNm != "" {
			hasNamed = true
			break
		}
	}
	if !hasNamed {
		/*
			No *named* capture groups. This covers both a pattern without any groups
			and a pattern that only has unnamed groups; the result is the same for
			both (non-nil but empty if inclNoMatch, nil otherwise).
		*/
		if inclNoMatch {
			matches = make(map[string][]string)
		}
		return
	}

	if len(allMatchIndices) == 0 {
		// No matches (and thus no submatches) whatsoever.
		// This should already be covered by the nil check above;
		// it is kept as an early, safe return when there is nothing to iterate.
		matches = make(map[string][]string)
		if inclNoMatch {
			for _, grpNm = range names {
				if grpNm == "" {
					continue
				}
				matches[grpNm] = nil
			}
		}
		return
	}

	// Do *NOT* trim/reslice allMatchIndices!
	// The reslicing is done below, per match, on matchIndices.
	tmpMap = make(map[string][]string, len(names))

	// Each matchIndices is one complete match, shaped like the result of
	// FindStringSubmatchIndex: []int{<start>, <end>, <start>, <end>, ...}.
	for _, matchIndices = range allMatchIndices {
		if len(matchIndices) < 2 {
			/*
				Nil or truncated index set. This should be unreachable, but it has to
				be checked *before* the [0:2] slice below, which needs two elements.
				Nothing is staged here: the final inclNoMatch pass defines every
				missing group key anyway.
			*/
			continue
		}

		// Grab the whole-match string first; it's not currently *used* by anything but may be in the future.
		matchStr, _ = strIdxSlicer(
			s,
			*(*[2]int)(matchIndices[0:2]),
		)
		// We do not hand the entire match string to the indexer; it is not needed there.
		_ = matchStr

		// Drop the whole-match pair so that group grpIdx owns the pair at 2*grpIdx and 2*grpIdx+1.
		matchIndices = matchIndices[2:]

		for grpIdx = 0; grpIdx < len(names) && (2*grpIdx)+1 < len(matchIndices); grpIdx++ {
			si = stringIndexer{
				group:   grpIdx,
				start:   matchIndices[2*grpIdx],
				end:     matchIndices[(2*grpIdx)+1],
				matched: true,
				nm:      names[grpIdx],
				grpS:    "",
				ptrn:    r.Regexp,
			}
			if si.nm == "" {
				// unnamed capture group
				continue
			}

			// sets si.matched and si.grpS
			si.idxSlice(&s)

			if !si.matched {
				// The pattern matched, but this particular group did not take part in it.
				if !inclNoMatch {
					continue
				}
				if _, ok = tmpMap[si.nm]; !ok {
					if inclNoMatchStrict {
						// Strict mode keeps one "" placeholder per missed occurrence.
						tmpMap[si.nm] = []string{""}
					} else {
						// Non-strict mode only guarantees that the key is defined.
						tmpMap[si.nm] = nil
					}
				} else if inclNoMatchStrict {
					tmpMap[si.nm] = append(tmpMap[si.nm], "")
				}
				continue
			}

			// append on a missing (or nil) entry creates the slice as needed.
			tmpMap[si.nm] = append(tmpMap[si.nm], si.grpS)
		}
	}

	// Guarantees every named group has a key when inclNoMatch is requested.
	if inclNoMatch {
		for _, grpNm = range names {
			if grpNm == "" {
				continue
			}
			if _, ok = tmpMap[grpNm]; !ok {
				tmpMap[grpNm] = nil
			}
		}
	}

	// An empty result is reported as nil (no named group took part in any match).
	if len(tmpMap) > 0 {
		matches = tmpMap
	}

	return
}