1 package io.jawk.jrt;
2
3 /*-
4 * ╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲
5 * Jawk
6 * ჻჻჻჻჻჻
7 * Copyright (C) 2006 - 2026 MetricsHub
8 * ჻჻჻჻჻჻
9 * This program is free software: you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser General Public License as
11 * published by the Free Software Foundation, either version 3 of the
12 * License, or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Lesser Public License for more details.
18 *
19 * You should have received a copy of the GNU General Lesser Public
20 * License along with this program. If not, see
21 * <http://www.gnu.org/licenses/lgpl-3.0.html>.
22 * ╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱
23 */
24
25 import java.io.FilterReader;
26 import java.io.IOException;
27 import java.io.Reader;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30
/**
 * A reader which consumes one record at a time from
 * an underlying input reader.
 * <h2>Greedy Regex Matching</h2>
 * The current implementation matches the record separator against
 * the contents of an input buffer (the underlying input
 * stream filling the input buffer). Records are
 * split against the matched regular expression
 * input, treating the regular expression as a
 * record separator.
 * <p>
 * By default, greedy regular expression matching
 * for the record separator is turned off. It is assumed
 * the user will employ a non-ambiguous regex for the record separator.
 * For example, ab*c is a non-ambiguous regex,
 * but ab?c?b is an ambiguous regex because
 * it can match ab or abcb, and the reader may
 * accept either one, depending on input buffer boundaries.
 * The implemented way to employ greedy regex matching
 * is to consume subsequent input, one character at a time, for as long
 * as the match sits at the end of the input buffer and the regex engine
 * reports that more input could change it, or until no input is
 * available. However, this behavior
 * is not desirable in all cases (e.g., interactive
 * input against some sort of ambiguous newline
 * regex). To enable greedy record separator regex consumption,
 * use <code>-Djawk.forceGreedyRS=true</code>
 * (<code>1</code> and <code>yes</code> are accepted as well).
 *
 * @author Danny Daglas
 */
public class PartitioningReader extends FilterReader {

	/** Whether greedy record separator matching was requested through the system property. */
	private static final boolean FORCE_GREEDY_RS;

	static {
		String grs = System.getProperty("jawk.forceGreedyRS", "0").trim();
		FORCE_GREEDY_RS = grs.equals("1") || grs.equalsIgnoreCase("yes") || grs.equalsIgnoreCase("true");
	}

	/** Compiled form of the current record separator. */
	private Pattern rs;
	/** Matcher over {@link #remaining}; lazily created, discarded whenever {@link #rs} changes. */
	private Matcher matcher;
	private final boolean fromFileNameList;

	/** The record separator most recently applied, used to skip redundant recompilation. */
	private String recordSeparator = null;
	/** When true (empty record separator), the whole input is returned as a single record. */
	private boolean consumeAll = false;

	/** Characters read from the underlying reader that have not been returned as records yet. */
	private final StringBuilder remaining = new StringBuilder();
	private final char[] readBuffer = new char[4096];

	/** Set once the underlying reader has reported end of input. */
	private boolean eof = false;

	/**
	 * Construct the partitioning reader.
	 *
	 * @param reader The reader containing the input data stream.
	 * @param recordSeparator The record separator, as a regular expression.
	 */
	public PartitioningReader(Reader reader, String recordSeparator) {
		this(reader, recordSeparator, false);
	}

	/**
	 * Construct the partitioning reader.
	 *
	 * @param r The reader containing the input data stream.
	 * @param recordSeparator The record separator, as a regular expression.
	 * @param fromFileNameList Whether the underlying input reader
	 *        is a file from the filename list (the parameters passed
	 *        into AWK after the script argument).
	 */
	public PartitioningReader(Reader r, String recordSeparator, boolean fromFileNameList) {
		super(r);
		this.fromFileNameList = fromFileNameList;
		setRecordSeparator(recordSeparator);
	}

	/**
	 * Assign a new record separator for this partitioning reader.
	 * <ul>
	 * <li>An empty string makes the reader return the whole input as one record.</li>
	 * <li>{@code "\n"} matches both LF and CRLF line endings.</li>
	 * <li>{@code "\r\n"} and {@code "\r"} are matched literally.</li>
	 * <li>Anything else is compiled as a regular expression.</li>
	 * </ul>
	 * The new separator applies to the next call to {@link #readRecord()},
	 * including input that has already been buffered.
	 *
	 * @param recordSeparator The new record separator, as a regular expression.
	 * @throws NullPointerException if {@code recordSeparator} is null
	 * @throws java.util.regex.PatternSyntaxException if the separator is not a valid regular expression
	 */
	public final void setRecordSeparator(String recordSeparator) {
		if (!recordSeparator.equals(this.recordSeparator)) {
			if ("".equals(recordSeparator)) {
				consumeAll = true;
				rs = Pattern.compile("\\z", Pattern.DOTALL | Pattern.MULTILINE);
			} else if ("\n".equals(recordSeparator)) {
				// Match \r?\n so that CRLF inputs produce clean records without trailing \r
				consumeAll = false;
				rs = Pattern.compile("\\r?\\n");
			} else if ("\r\n".equals(recordSeparator) || "\r".equals(recordSeparator)) {
				consumeAll = false;
				rs = Pattern.compile(recordSeparator, Pattern.LITERAL);
			} else {
				consumeAll = false;
				rs = Pattern.compile(recordSeparator, Pattern.DOTALL | Pattern.MULTILINE);
			}
			this.recordSeparator = recordSeparator;
			// Matcher.reset(CharSequence) keeps the pattern it was created with, so the
			// cached matcher must be dropped or the previous separator would stay in effect.
			matcher = null;
		}
	}

	/**
	 * Tells whether the underlying input reader comes from the filename list.
	 *
	 * @return true if the underlying input reader is from a
	 *         filename list argument; false otherwise
	 */
	public boolean fromFilenameList() {
		return fromFileNameList;
	}

	/**
	 * {@inheritDoc}
	 * <p>
	 * Every character read through this method is also appended to the
	 * internal buffer from which {@link #readRecord()} extracts records.
	 */
	@Override
	public int read(char[] b, int start, int len) throws IOException {
		int readChars = super.read(b, start, len);
		if (readChars >= 0) {
			remaining.append(b, start, readChars);
		}
		return readChars;
	}

	/**
	 * Consume one record from the reader.
	 * It uses the record separator regular
	 * expression to mark start/end of records.
	 * At end of input, whatever is left in the buffer is returned as the
	 * last record.
	 *
	 * @return the next record, null if no more records exist
	 * @throws java.io.IOException upon an IO error
	 */
	public String readRecord() throws IOException {
		do {
			// Fill the buffer until it holds a non-empty separator match or the input ends.
			while (consumeAll || eof || remaining.length() == 0 || !findRecordSeparator()) {
				int len = read(readBuffer, 0, readBuffer.length);
				if (eof || (len < 0)) {
					eof = true;
					String lastRecord = remaining.toString();
					remaining.setLength(0);
					return lastRecord.isEmpty() ? null : lastRecord;
				} else if (len == 0) {
					throw new RuntimeException("len == 0 ?!");
				}
			}
			// In greedy mode the match may be lost once more input is seen;
			// in that case go back to filling the buffer.
		} while (FORCE_GREEDY_RS && !extendMatchGreedily());

		// we have a record separator!

		String retVal = remaining.substring(0, matcher.start());
		remaining.delete(0, matcher.end());
		return retVal;
	}

	/**
	 * Moves the current separator match away from the end of the buffer, so
	 * that a buffer boundary landing in the middle of a longer potential
	 * match does not truncate the separator. Input is read one character at
	 * a time, and only while the match ends exactly at the end of the buffer
	 * and the regex engine reports that it hit the end of the input (meaning
	 * more characters could change the result).
	 * <p>
	 * Must only be called right after {@link #findRecordSeparator()} returned true.
	 *
	 * @return {@code true} when the matcher still holds a valid separator match,
	 *         {@code false} when the additional input made the match disappear
	 * @throws IOException upon an IO error
	 */
	private boolean extendMatchGreedily() throws IOException {
		while (matcher.end() == remaining.length() && matcher.hitEnd()) {
			if (read(readBuffer, 0, 1) <= 0) {
				// No more input available: the buffer is unchanged, so the current match stands.
				return true;
			}
			if (!findRecordSeparator()) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Finds the next record separator match that consumes at least one
	 * character. Zero-length regular-expression matches are ignored because
	 * using them as separators would not advance the reader and would loop
	 * forever on nullable regexes such as {@code ()}.
	 * <p>
	 * On success, the matcher is left positioned on the first non-empty match
	 * in the buffer.
	 *
	 * @return {@code true} when a non-empty record separator match was found
	 */
	private boolean findRecordSeparator() {
		if (matcher == null) {
			matcher = rs.matcher(remaining);
		} else {
			matcher.reset(remaining);
		}
		while (matcher.find()) {
			if (matcher.start() != matcher.end()) {
				return true;
			}
		}
		return false;
	}
}