Skip to content

Commit 44a4ad2

Browse files
authored
Adding flatfile.fixedlength.EnvelopeDecl and its ColumnDecl (#166)
ColumnDecl is mostly similar to the existing fixed-length implementation (https://github.com/jf-tech/omniparser/blob/d8e230aa07673b9ce7b933419c4d528b1796e56f/extensions/omniv21/fileformat/fixedlength/decl.go#L17). EnvelopeDecl is mostly different: it follows the new flatfile.RecDecl model, which introduces Type/Target/Min/Max/etc. However, don't panic. With the new decl, the final resulting schema (coming soon) will by and large be very similar to existing fixed-length schemas, with only a couple of minor/trivial tweaks for existing complex use cases and no changes at all for simple/common use cases.
1 parent d8e230a commit 44a4ad2

File tree

2 files changed

+269
-0
lines changed

2 files changed

+269
-0
lines changed
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
package fixedlength
2+
3+
import (
4+
"fmt"
5+
"regexp"
6+
"unicode/utf8"
7+
8+
"github.com/jf-tech/go-corelib/maths"
9+
10+
"github.com/jf-tech/omniparser/extensions/omniv21/fileformat/flatfile"
11+
)
12+
13+
// ColumnDecl describes a column of an envelope: where the column's value sits
// within a line and, optionally, which lines the column may be taken from.
type ColumnDecl struct {
	// Name is the name of the column.
	Name string `json:"name,omitempty"`
	// StartPos is the 1-based position of the column's first rune in a line.
	StartPos int `json:"start_pos,omitempty"`
	// Length is the number of runes (not bytes) the column spans.
	Length int `json:"length,omitempty"`
	// LinePattern is an optional pattern restricting which lines this column
	// applies to.
	LinePattern *string `json:"line_pattern,omitempty"`

	// linePatternRegexp is what lineMatch actually consults; nil means every
	// line matches. NOTE(review): presumably the compiled form of LinePattern,
	// populated outside this block -- confirm.
	linePatternRegexp *regexp.Regexp
}

// lineMatch reports whether line satisfies the column's line pattern. A column
// without a compiled line pattern matches every line.
func (c *ColumnDecl) lineMatch(line []byte) bool {
	return c.linePatternRegexp == nil || c.linePatternRegexp.Match(line)
}

// lineToColumnValue extracts the column's value from line: it skips the first
// StartPos-1 runes and returns the following Length runes as a string.
//
// The extraction is deliberately lenient: if the line ends before StartPos the
// result is empty, and if fewer than Length runes remain the result is simply
// whatever is left. No error is reported in either case.
func (c *ColumnDecl) lineToColumnValue(line []byte) string {
	// StartPos is 1-based; the original author notes that a value >= 1 is
	// guaranteed by JSON schema validation done earlier.
	rest := line
	for skip := c.StartPos - 1; skip > 0 && len(rest) > 0; skip-- {
		_, size := utf8.DecodeRune(rest)
		rest = rest[size:]
	}
	// Walk up to Length runes forward, tracking the byte offset where the
	// column ends.
	end := 0
	for remaining := c.Length; remaining > 0 && end < len(rest); remaining-- {
		_, size := utf8.DecodeRune(rest[end:])
		end += size
	}
	return string(rest[:end])
}
52+
53+
// Values recognized for EnvelopeDecl.Type.
const (
	// typeEnvelope marks a regular envelope.
	typeEnvelope = "envelope"
	// typeGroup marks an envelope group; EnvelopeDecl.Group reports true for it.
	typeGroup = "envelope_group"
)
57+
58+
// EnvelopeDecl describes an envelope of a fixed-length input.
59+
// If Rows/Header/Footer are all nil, then it defaults to Rows = 1.
60+
// If Rows specified, then Header/Footer must be nil. (JSON schema validation will ensure this.)
61+
// If Header is specified, Rows must be nil. (JSON schema validation will ensure this.)
62+
// Footer is optional; If not specified, Header will be used for a single-line envelope matching.
63+
type EnvelopeDecl struct {
64+
Name string `json:"name,omitempty"`
65+
Rows *int `json:"rows,omitempty"`
66+
Header *string `json:"header,omitempty"`
67+
Footer *string `json:"footer,omitempty"`
68+
Type *string `json:"type,omitempty"`
69+
IsTarget bool `json:"is_target,omitempty"`
70+
Min *int `json:"min,omitempty"`
71+
Max *int `json:"max,omitempty"`
72+
Columns []*ColumnDecl `json:"columns,omitempty"`
73+
Children []*EnvelopeDecl `json:"child_envelopes,omitempty"`
74+
75+
fqdn string // fullly hierarchical name to the envelope.
76+
childRecDecls []flatfile.RecDecl
77+
headerRegexp *regexp.Regexp
78+
footerRegexp *regexp.Regexp
79+
}
80+
81+
func (e *EnvelopeDecl) DeclName() string {
82+
return e.Name
83+
}
84+
85+
func (e *EnvelopeDecl) Target() bool {
86+
return e.IsTarget
87+
}
88+
89+
func (e *EnvelopeDecl) Group() bool {
90+
return e.Type != nil && *e.Type == typeGroup
91+
}
92+
93+
// MinOccurs defaults to 0. Fixed-length input most common scenario is min=0/max=unbounded.
94+
func (e *EnvelopeDecl) MinOccurs() int {
95+
switch e.Min {
96+
case nil:
97+
return 0
98+
default:
99+
return *e.Min
100+
}
101+
}
102+
103+
// MaxOccurs defaults to unbounded. Fixed-length input most common scenario is min=0/max=unbounded.
104+
func (e *EnvelopeDecl) MaxOccurs() int {
105+
switch {
106+
case e.Max == nil:
107+
fallthrough
108+
case *e.Max < 0:
109+
return maths.MaxIntValue
110+
default:
111+
return *e.Max
112+
}
113+
}
114+
115+
func (e *EnvelopeDecl) ChildDecls() []flatfile.RecDecl {
116+
return e.childRecDecls
117+
}
118+
119+
func (e *EnvelopeDecl) rowsBased() bool {
120+
// for header/footer based envelope, header must be specified; otherwise, it's rows based.
121+
return e.Header == nil
122+
}
123+
124+
// rows() defaults to 1. Fixed-length input most common scenario is rows-based single line envelope.
125+
func (e *EnvelopeDecl) rows() int {
126+
if !e.rowsBased() {
127+
panic(fmt.Sprintf("envelope '%s' is not rows based", e.fqdn))
128+
}
129+
if e.Rows == nil {
130+
return 1
131+
}
132+
return *e.Rows
133+
}
134+
135+
func (e *EnvelopeDecl) matchHeader(line []byte) bool {
136+
if e.headerRegexp == nil {
137+
panic(fmt.Sprintf("envelope '%s' is not header/footer based", e.fqdn))
138+
}
139+
return e.headerRegexp.Match(line)
140+
}
141+
142+
// Footer is optional. If not specified, it always matches. Thus for a header/footer envelope,
143+
// if the footer isn't specified, it effectively becomes a single-row envelope matched by header,
144+
// given that after the header matches a line, matchFooter is called on the same line.
145+
func (e *EnvelopeDecl) matchFooter(line []byte) bool {
146+
if e.footerRegexp == nil {
147+
return true
148+
}
149+
return e.footerRegexp.Match(line)
150+
}
151+
152+
func toFlatFileRecDecls(es []*EnvelopeDecl) []flatfile.RecDecl {
153+
if len(es) == 0 {
154+
return nil
155+
}
156+
ret := make([]flatfile.RecDecl, len(es))
157+
for i, d := range es {
158+
ret[i] = d
159+
}
160+
return ret
161+
}
162+
163+
// FileDecl describes fixed-length schema `file_declaration` setting.
164+
type FileDecl struct {
165+
Envelopes []*EnvelopeDecl `json:"envelopes,omitempty"`
166+
}
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
package fixedlength
2+
3+
import (
4+
"regexp"
5+
"testing"
6+
7+
"github.com/jf-tech/go-corelib/maths"
8+
"github.com/jf-tech/go-corelib/strs"
9+
"github.com/jf-tech/go-corelib/testlib"
10+
"github.com/jf-tech/omniparser/extensions/omniv21/fileformat/flatfile"
11+
"github.com/stretchr/testify/assert"
12+
)
13+
14+
func TestColumnDecl_LineMatch(t *testing.T) {
15+
assert.True(t, (&ColumnDecl{}).lineMatch([]byte("test")))
16+
assert.False(t, (&ColumnDecl{linePatternRegexp: regexp.MustCompile("^ABC.*$")}).
17+
lineMatch([]byte("test")))
18+
assert.True(t, (&ColumnDecl{linePatternRegexp: regexp.MustCompile("^ABC.*$")}).
19+
lineMatch([]byte("ABCDEFG")))
20+
}
21+
22+
func TestColumnDecl_LineToColumnValue(t *testing.T) {
23+
decl := func(start, length int) *ColumnDecl {
24+
return &ColumnDecl{StartPos: start, Length: length}
25+
}
26+
assert.Equal(t, "", decl(10, 4).lineToColumnValue([]byte("test"))) // fully out of range
27+
assert.Equal(t, "st", decl(3, 4).lineToColumnValue([]byte("test"))) // partially out of range
28+
assert.Equal(t, "tes", decl(1, 3).lineToColumnValue([]byte("test"))) // fully in range
29+
}
30+
31+
func TestEnvelopeDecl(t *testing.T) {
32+
// DeclName()
33+
e := &EnvelopeDecl{Name: "e1"}
34+
assert.Equal(t, "e1", e.DeclName())
35+
e.fqdn = e.DeclName()
36+
37+
// Target()
38+
assert.False(t, e.Target())
39+
e.IsTarget = true
40+
assert.True(t, e.Target())
41+
42+
// Group()
43+
assert.False(t, e.Group())
44+
e.Type = strs.StrPtr(typeEnvelope)
45+
assert.False(t, e.Group())
46+
e.Type = strs.StrPtr(typeGroup)
47+
assert.True(t, e.Group())
48+
49+
// MinOccurs()
50+
assert.Equal(t, 0, e.MinOccurs())
51+
e.Min = testlib.IntPtr(42)
52+
assert.Equal(t, 42, e.MinOccurs())
53+
54+
// MaxOccurs()
55+
assert.Equal(t, maths.MaxIntValue, e.MaxOccurs())
56+
e.Max = testlib.IntPtr(-1)
57+
assert.Equal(t, maths.MaxIntValue, e.MaxOccurs())
58+
e.Max = testlib.IntPtr(42)
59+
assert.Equal(t, 42, e.MaxOccurs())
60+
61+
// ChildDecls()
62+
assert.Nil(t, e.ChildDecls())
63+
e.childRecDecls = []flatfile.RecDecl{}
64+
assert.Equal(t, e.childRecDecls, e.ChildDecls())
65+
66+
// rowsBased()
67+
assert.True(t, e.rowsBased())
68+
e.Header = strs.StrPtr("^ABC$")
69+
assert.False(t, e.rowsBased())
70+
71+
// rows()
72+
assert.PanicsWithValue(t, "envelope 'e1' is not rows based", func() { e.rows() })
73+
e.Header = nil
74+
assert.Equal(t, 1, e.rows())
75+
e.Rows = testlib.IntPtr(42)
76+
assert.Equal(t, 42, e.rows())
77+
78+
// matchHeader()
79+
assert.PanicsWithValue(
80+
t, "envelope 'e1' is not header/footer based", func() { e.matchHeader(nil) })
81+
e.headerRegexp = regexp.MustCompile("^ABC$")
82+
assert.False(t, e.matchHeader([]byte("ABCD")))
83+
assert.True(t, e.matchHeader([]byte("ABC")))
84+
85+
// matchFooter()
86+
assert.True(t, e.matchFooter([]byte("ABCD")))
87+
e.footerRegexp = regexp.MustCompile("^ABC$")
88+
assert.False(t, e.matchFooter([]byte("ABCD")))
89+
assert.True(t, e.matchFooter([]byte("ABC")))
90+
}
91+
92+
func TestToFlatFileRecDecls(t *testing.T) {
93+
assert.Nil(t, toFlatFileRecDecls(nil))
94+
assert.Nil(t, toFlatFileRecDecls([]*EnvelopeDecl{}))
95+
es := []*EnvelopeDecl{
96+
{},
97+
{},
98+
}
99+
ds := toFlatFileRecDecls(es)
100+
for i := range ds {
101+
assert.Same(t, es[i], ds[i].(*EnvelopeDecl))
102+
}
103+
}

0 commit comments

Comments
 (0)