View Javadoc
1   package org.metricshub.jawk.jrt;
2   
3   /*-
4    * ╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲
5    * Jawk
6    * ჻჻჻჻჻჻
7    * Copyright (C) 2006 - 2025 MetricsHub
8    * ჻჻჻჻჻჻
9    * This program is free software: you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser General Public License as
11   * published by the Free Software Foundation, either version 3 of the
12   * License, or (at your option) any later version.
13   *
14   * This program is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU General Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU General Lesser Public
20   * License along with this program.  If not, see
21   * <http://www.gnu.org/licenses/lgpl-3.0.html>.
22   * ╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱
23   */
24  
25  import java.io.FilterReader;
26  import java.io.IOException;
27  import java.io.Reader;
28  import java.util.regex.Matcher;
29  import java.util.regex.Pattern;
30  
/**
 * A reader which consumes one record at a time from
 * an underlying input reader.
 * <h2>Greedy Regex Matching</h2>
 * The current implementation matches the record separator (RS)
 * against the contents of an input buffer (the underlying input
 * stream filling the input buffer). Records are split where the
 * regular expression matches, treating the regular expression
 * as a record separator.
 * <p>
 * By default, greedy regular expression matching
 * for the record separator is turned off. It is assumed
 * the user will employ a non-ambiguous regex for RS.
 * For example, <code>ab*c</code> is a non-ambiguous regex,
 * but <code>abc?</code> is an ambiguous regex because
 * it can match either <code>ab</code> or <code>abc</code>, and the
 * reader may accept either one, depending on input buffer boundaries.
 * <p>
 * When greedy matching is enabled, a separator match that touches
 * the end of the buffered input, and that the regex engine reports
 * could still change with more input, is not accepted right away:
 * the reader consumes further input one character at a time until
 * the match no longer depends on the end of the buffer, or no more
 * input is available. This behavior is not desirable in all cases
 * (e.g., interactive input against some sort of ambiguous newline
 * regex), which is why it is opt-in. To enable greedy RS consumption,
 * use <code>-Djawk.forceGreedyRS=true</code> (<code>1</code> and
 * <code>yes</code> are accepted as well).
 *
 * @author Danny Daglas
 */
public class PartitioningReader extends FilterReader {

	/** Whether greedy record separator matching was requested through the system property. */
	private static final boolean FORCE_GREEDY_RS;

	static {
		String grs = System.getProperty("jawk.forceGreedyRS", "0").trim();
		FORCE_GREEDY_RS = grs.equals("1") || grs.equalsIgnoreCase("yes") || grs.equalsIgnoreCase("true");
	}

	/** Compiled form of the current record separator. */
	private Pattern rs;
	/** Matcher over {@link #remaining}; null until first needed or after the separator changed. */
	private Matcher matcher;
	private final boolean fromFileNameList;
	/** The record separator last passed to {@link #setRecordSeparator(String)}. */
	private String recordSeparator = null;
	/** True when the separator is the empty string: the whole input is returned as one record. */
	private boolean consumeAll = false;
	/** Characters read from the underlying reader that have not been returned as a record yet. */
	private final StringBuilder remaining = new StringBuilder();
	private final char[] readBuffer = new char[4096];
	/** Set once the underlying reader reported end of stream to {@link #readRecord()}. */
	private boolean eof = false;

	/**
	 * Construct the partitioning reader.
	 *
	 * @param reader The reader containing the input data stream.
	 * @param recordSeparator The record separator, as a regular expression.
	 */
	public PartitioningReader(Reader reader, String recordSeparator) {
		this(reader, recordSeparator, false);
	}

	/**
	 * Construct the partitioning reader.
	 *
	 * @param r The reader containing the input data stream.
	 * @param recordSeparator The record separator, as a regular expression.
	 * @param fromFileNameList Whether the underlying input reader
	 *        is a file from the filename list (the parameters passed
	 *        into AWK after the script argument).
	 */
	public PartitioningReader(Reader r, String recordSeparator, boolean fromFileNameList) {
		super(r);
		this.fromFileNameList = fromFileNameList;
		setRecordSeparator(recordSeparator);
	}

	/**
	 * Assign a new record separator for this partitioning reader.
	 * <p>
	 * An empty separator makes the reader return the whole input as a
	 * single record. The separators <code>\n</code>, <code>\r\n</code>
	 * and <code>\r</code> are matched literally; anything else is
	 * compiled as a regular expression. Calling this method with the
	 * separator already in use is a no-op.
	 *
	 * @param recordSeparator The new record separator, as a regular expression.
	 */
	public final void setRecordSeparator(String recordSeparator) {
		if (recordSeparator.equals(this.recordSeparator)) {
			return;
		}

		if ("".equals(recordSeparator)) {
			consumeAll = true;
			rs = Pattern.compile("\\z", Pattern.DOTALL | Pattern.MULTILINE);
		} else if ("\n".equals(recordSeparator) || "\r\n".equals(recordSeparator) || "\r".equals(recordSeparator)) {
			// For performance reason, handle the default RS in a specific way here
			consumeAll = false;
			rs = Pattern.compile(recordSeparator, Pattern.LITERAL);
		} else {
			consumeAll = false;
			rs = Pattern.compile(recordSeparator, Pattern.DOTALL | Pattern.MULTILINE);
		}
		this.recordSeparator = recordSeparator;

		// Matcher.reset(CharSequence) keeps the pattern the matcher was created
		// with, so a cached matcher would go on splitting on the previous
		// separator. Drop it; readRecord() builds a new one from the new pattern.
		matcher = null;
	}

	/**
	 * Tells whether the input comes from the filename list.
	 *
	 * @return true whether the underlying input reader is from a
	 *         filename list argument; false otherwise
	 */
	public boolean fromFilenameList() {
		return fromFileNameList;
	}

	/**
	 * {@inheritDoc}
	 * <p>
	 * Every character read through this method is also appended to the
	 * internal buffer from which {@link #readRecord()} extracts records.
	 */
	@Override
	public int read(char[] b, int start, int len) throws IOException {
		int readChars = super.read(b, start, len);
		if (readChars >= 0) {
			remaining.append(b, start, readChars);
		}
		return readChars;
	}

	/**
	 * Consume one record from the reader.
	 * It uses the record separator regular
	 * expression to mark start/end of records.
	 * When the end of the input is reached, whatever is left in the
	 * buffer is returned as the last record.
	 *
	 * @return the next record, null if no more records exist
	 * @throws java.io.IOException upon an IO error
	 */
	public String readRecord() throws IOException {
		if (matcher == null) {
			matcher = rs.matcher(remaining);
		} else {
			matcher.reset(remaining);
		}

		while (true) {
			// The matcher is always freshly reset here, so find() locates the
			// FIRST separator in the buffered input.
			boolean found = !consumeAll && !eof && remaining.length() > 0 && matcher.find();
			if (found && !mayChangeWithMoreInput()) {
				break;
			}

			// No separator yet: pull a whole buffer. Separator found but still
			// ambiguous (greedy mode): pull a single character and re-match.
			int len = read(readBuffer, 0, found ? 1 : readBuffer.length);
			if (found && len < 0) {
				// Nothing left to disambiguate with. The matcher has not been
				// reset, so it still describes the separator found above.
				break;
			}
			if (eof || len < 0) {
				eof = true;
				return drainRemaining();
			}
			if (len == 0) {
				throw new RuntimeException("len == 0 ?!");
			}
			matcher.reset(remaining);
		}

		// we have a record separator!
		String record = remaining.substring(0, matcher.start());
		remaining.delete(0, matcher.end());
		return record;
	}

	/**
	 * Tells whether the current separator match must be re-evaluated against
	 * more input: only in greedy mode, when the match ends exactly at the end
	 * of the buffered input and the regex engine looked past that end.
	 */
	private boolean mayChangeWithMoreInput() {
		return FORCE_GREEDY_RS && matcher.end() == remaining.length() && matcher.hitEnd();
	}

	/** Empties the buffer and returns its content, or null when it was already empty. */
	private String drainRemaining() {
		if (remaining.length() == 0) {
			return null;
		}
		String rest = remaining.toString();
		remaining.setLength(0);
		return rest;
	}
}
204 }