1 package org.metricshub.jawk.jrt;
2
3 /*-
4 * ╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲
5 * Jawk
6 * ჻჻჻჻჻჻
7 * Copyright (C) 2006 - 2025 MetricsHub
8 * ჻჻჻჻჻჻
9 * This program is free software: you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser General Public License as
11 * published by the Free Software Foundation, either version 3 of the
12 * License, or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Lesser Public License for more details.
18 *
19 * You should have received a copy of the GNU General Lesser Public
20 * License along with this program. If not, see
21 * <http://www.gnu.org/licenses/lgpl-3.0.html>.
22 * ╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱
23 */
24
25 import java.io.FilterReader;
26 import java.io.IOException;
27 import java.io.Reader;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30
/**
 * A reader which consumes one record at a time from
 * an underlying input reader.
 * <h2>Greedy Regex Matching</h2>
 * The current implementation matches the record separator against
 * the contents of an input buffer (the underlying input
 * stream filling the input buffer). Records are
 * split against the matched regular expression
 * input, treating the regular expression as a
 * record separator.
 * <p>
 * By default, greedy regular expression matching
 * for the record separator is turned off. It is assumed
 * the user will employ a non-ambiguous regex for the record separator.
 * For example, ab*c is a non-ambiguous regex,
 * but ab?c?b is an ambiguous regex because
 * it can match ab or abc, and the reader may
 * accept either one, depending on input buffer boundaries.
 * The implemented way to employ greedy regex matching
 * is to consume subsequent input, one character at a time,
 * until the match no longer touches the end of the input buffer,
 * or no input is available. However, this behavior
 * is not desirable in all cases (e.g., interactive
 * input against some sort of ambiguous newline
 * regex). To enable greedy record separator regex consumption,
 * use <code>-Djawk.forceGreedyRS=true</code>.
 *
 * @author Danny Daglas
 */
public class PartitioningReader extends FilterReader {

    /**
     * Whether separator matches that touch the end of the buffered input must
     * be extended with more input before being accepted. Read once from the
     * {@code jawk.forceGreedyRS} system property ("1", "yes" or "true").
     */
    private static final boolean FORCE_GREEDY_RS;

    static {
        String grs = System.getProperty("jawk.forceGreedyRS", "0").trim();
        FORCE_GREEDY_RS = grs.equals("1") || grs.equalsIgnoreCase("yes") || grs.equalsIgnoreCase("true");
    }

    /** Compiled form of the current record separator. */
    private Pattern rs;
    /** Matcher over {@link #remaining}; null until first use or after the separator changed. */
    private Matcher matcher;
    private final boolean fromFileNameList;

    /** The record separator string that {@link #rs} was compiled from. */
    private String recordSeparator = null;
    /** True when the separator is the empty string: the whole input is returned as one record. */
    private boolean consumeAll = false;

    /** Characters read from the underlying reader that have not been returned as a record yet. */
    private final StringBuilder remaining = new StringBuilder();
    private final char[] readBuffer = new char[4096];

    /** Set once the underlying reader has reported end of stream to {@link #readRecord()}. */
    private boolean eof = false;

    /**
     * Construct the partitioning reader.
     *
     * @param reader The reader containing the input data stream.
     * @param recordSeparator The record separator, as a regular expression.
     */
    public PartitioningReader(Reader reader, String recordSeparator) {
        this(reader, recordSeparator, false);
    }

    /**
     * Construct the partitioning reader.
     *
     * @param r The reader containing the input data stream.
     * @param recordSeparator The record separator, as a regular expression.
     * @param fromFileNameList Whether the underlying input reader
     *        is a file from the filename list (the parameters passed
     *        into AWK after the script argument).
     */
    public PartitioningReader(Reader r, String recordSeparator, boolean fromFileNameList) {
        super(r);
        this.fromFileNameList = fromFileNameList;
        setRecordSeparator(recordSeparator);
    }

    /**
     * Assign a new record separator for this partitioning reader.
     * <p>
     * An empty separator makes {@link #readRecord()} return the whole input as
     * a single record. The separators "\n", "\r\n" and "\r" are compiled as
     * literals; anything else is compiled as a regular expression.
     * The new separator applies to input that is already buffered as well.
     *
     * @param recordSeparator The new record separator, as a regular expression.
     */
    public final void setRecordSeparator(String recordSeparator) {
        if (recordSeparator.equals(this.recordSeparator)) {
            return;
        }
        if ("".equals(recordSeparator)) {
            consumeAll = true;
            rs = Pattern.compile("\\z", Pattern.DOTALL | Pattern.MULTILINE);
        } else if ("\n".equals(recordSeparator) || "\r\n".equals(recordSeparator) || "\r".equals(recordSeparator)) {
            // For performance reasons, handle the default RS in a specific way here
            consumeAll = false;
            rs = Pattern.compile(recordSeparator, Pattern.LITERAL);
        } else {
            consumeAll = false;
            rs = Pattern.compile(recordSeparator, Pattern.DOTALL | Pattern.MULTILINE);
        }
        this.recordSeparator = recordSeparator;
        // The cached matcher is bound to the previous pattern; drop it so the
        // next readRecord() splits the buffered input with the new separator.
        matcher = null;
    }

    /**
     * Tell whether the input comes from the filename list.
     *
     * @return true if the underlying input reader is from a
     *         filename list argument; false otherwise
     */
    public boolean fromFilenameList() {
        return fromFileNameList;
    }

    /**
     * {@inheritDoc}
     * <p>
     * Every character read through this method is also appended to the
     * internal buffer that {@link #readRecord()} splits into records.
     */
    @Override
    public int read(char[] b, int start, int len) throws IOException {
        int readChars = super.read(b, start, len);
        if (readChars >= 0) {
            remaining.append(b, start, readChars);
        }
        return readChars;
    }

    /**
     * Consume one record from the reader.
     * It uses the record separator regular
     * expression to mark start/end of records.
     * At end of input, whatever is left in the buffer is returned as the
     * last record.
     *
     * @return the next record, null if no more records exist
     * @throws java.io.IOException upon an IO error
     */
    public String readRecord() throws IOException {
        if (matcher == null) {
            matcher = rs.matcher(remaining);
        } else {
            matcher.reset(remaining);
        }

        while (true) {
            boolean found = !consumeAll && !eof && remaining.length() > 0 && matcher.find();

            // In greedy mode, a match that touches the end of the buffer and for
            // which the regex engine ran into the end of input might become
            // longer (or disappear) once more characters are available.
            boolean extendable = found
                    && FORCE_GREEDY_RS
                    && matcher.end() == remaining.length()
                    && matcher.hitEnd();

            if (found && !extendable) {
                break;
            }

            // Greedy extension reads one char at a time so that no more input
            // than necessary is consumed; otherwise fill a whole buffer.
            int len = read(readBuffer, 0, extendable ? 1 : readBuffer.length);

            if (extendable && len < 0) {
                // No more input available: accept the match we already have.
                break;
            }
            if (eof || len < 0) {
                eof = true;
                String rest = remaining.toString();
                remaining.setLength(0);
                return rest.isEmpty() ? null : rest;
            }
            if (len == 0) {
                throw new RuntimeException("len == 0 ?!");
            }

            // The buffer grew: restart the search from its beginning.
            matcher.reset(remaining);
        }

        // we have a record separator!
        String record = remaining.substring(0, matcher.start());
        remaining.delete(0, matcher.end());
        return record;
    }
}