001/*
002Copyright (c) 2022-2023 Steve Shering
003
004All rights reserved.
005
006As a special exception, the copyright holder of this software gives you permission
007to use this software for personal, not-for-profit purposes.
008
009For any other purpose, a license must be obtained from the copyright holder.
010
011This copyright notice and this permission notice must be included in all copies 
012of this software, including copies of parts of this software.
013
014THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
017AUTHOR OR COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
019OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
020THE SOFTWARE.
021*/
022package net.sherst.io;
023
024import java.io.IOException;
025import java.io.Reader;
026
/**
 * Wrapper for any {@link java.io.Reader}
 * which provides look-ahead methods ({@code at}, {@code atSkip}, {@code atSkipWord}, {@code peek})
 * and some useful utility methods ({@link #readChar}, {@link #atEOF}, {@link #skipWhitespaceAndComments()}).
 * <p>
 * Makes most parsing tasks easy and efficient,
 * including parsing without a separate lexer
 * and parsing nested grammars.
 * <p>
 * The capacity is the maximum lookahead (in number of characters).
 * This must be specified when the {@link LookAheadReader} is created.
 * {@link LookAheadReader} uses a fast but fixed-size ring buffer.
 * <p>
 * End of input is never stored in the buffer: {@code char}-returning methods
 * report it as {@code (char) -1}, {@code int}-returning methods as {@code -1}.
 * <p>
 * The intended use is parsing source code from start to finish in one thread. Not thread safe.
 *
 * @author sherstDotNet@yahoo.com
 */
public class LookAheadReader extends Reader {
  /** Value of {@link #retained} while no valid mark exists. */
  private static final int NO_MARK = -1;

  /** Ring buffer; {@code null} once the reader has been closed. */
  private char[] arr;
  /** Size of {@link #arr}, i.e. the maximum lookahead. */
  private final int capacity;
  /** The wrapped reader. */
  private final Reader reader;
  /** Set once a read has reported end of input, or the reader was closed. */
  private boolean eof = false;
  /** Set once the wrapped reader has returned -1; it is not read again afterwards. */
  private boolean sourceExhausted = false;
  /** Index in {@link #arr} of the next character to be consumed. */
  private int head = 0;
  /** Index in {@link #arr} at which the next character from the source is stored. */
  private int tail = 0;
  /** Number of buffered characters that have not been consumed yet. */
  private int count = 0;
  /**
   * Number of already-consumed characters kept in the ring since {@link #mark(int)},
   * or {@link #NO_MARK}. They sit immediately before {@link #head}.
   * Invariant while marked: {@code retained + count <= capacity}.
   */
  private int retained = NO_MARK;

  /**
   * Returns {@code true} if {@code c} is an ASCII decimal digit.
   *
   * @param c the {@code char} to test
   * @return {@code true} if {@code c} is in the range {@code '0'} to {@code '9'}
   */
  public static boolean isDecimalDigit(char c) {
    return c >= '0' && c <= '9';
  }

  /**
   * Returns {@code true} if {@code c} may appear inside an identifier:
   * anything accepted by {@link #isIdentifierStart(char)} or {@link #isDecimalDigit(char)}.
   *
   * @param c the {@code char} to test
   * @return {@code true} if {@code c} is an identifier character
   */
  public static boolean isIdentifierChar(char c) {
    return isIdentifierStart(c) || isDecimalDigit(c);
  }

  /**
   * Returns {@code true} if {@code c} may start an identifier:
   * an ASCII letter, an underscore or a hyphen.
   *
   * @param c the {@code char} to test
   * @return {@code true} if {@code c} can start an identifier
   */
  public static boolean isIdentifierStart(char c) {
    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_' || c == '-';
  }

  /**
   * Returns {@code true} if {@code c} is a space, line feed, carriage return, tab
   * or U+FEFF (byte order mark).
   *
   * @param c the {@code char} to test
   * @return {@code true} if {@code c} counts as white space
   */
  public static boolean isWhitespace(char c) {
    return c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\uFEFF';
  }

  /**
   * Creates a new {@link LookAheadReader}
   * which wraps the provided {@link java.io.Reader}
   * with the default lookahead limit of 128 characters.
   *
   * @param reader the {@link java.io.Reader} to wrap
   * @throws NullPointerException if {@code reader} is {@code null}
   */
  public LookAheadReader(Reader reader) {
    this(reader, 128);
  }

  /**
   * Creates a new {@link LookAheadReader}
   * which wraps the provided {@link java.io.Reader}
   * with specified lookahead limit.
   *
   * @param reader the {@link java.io.Reader} to wrap
   * @param capacity the look ahead limit (number of characters)
   * @throws NullPointerException if {@code reader} is {@code null}
   * @throws IllegalArgumentException if {@code capacity} is not positive
   */
  public LookAheadReader(Reader reader, int capacity) {
    if (reader == null) {
      throw new NullPointerException("reader");
    }
    if (capacity <= 0) {
      throw new IllegalArgumentException("capacity must be positive: " + capacity);
    }
    this.reader = reader;
    this.capacity = capacity;
    this.arr = new char[capacity];
  }

  /**
   * Appends one character from the source to the ring.
   * If the slot it needs is still held for the mark, the mark is invalidated:
   * lookahead takes priority over mark retention.
   */
  private void add(char c) {
    if (count == capacity) {
      throw new IllegalStateException("buffer full");
    }
    if (retained >= 0 && retained + count == capacity) {
      retained = NO_MARK;
    }
    arr[tail] = c;
    tail++;
    if (tail == capacity) {
      tail = 0;
    }
    count++;
  }

  /**
   * Advances past {@code k} buffered characters ({@code 0 <= k <= count}).
   * While a mark is valid the characters stay in the ring so {@link #reset()} can replay them.
   */
  private void consume(int k) {
    head += k;
    if (head >= capacity) {
      head -= capacity;
    }
    count -= k;
    if (retained >= 0) {
      retained += k;
    }
  }

  /** Removes and returns the next buffered character. */
  private char remove() {
    if (count == 0) {
      throw new IllegalStateException("buffer empty");
    }
    char c = arr[head];
    consume(1);
    return c;
  }

  /**
   * Tries to buffer at least {@code n} unread characters.
   * Stops early when the wrapped reader is exhausted; end of input is recorded
   * in {@link #sourceExhausted} rather than stored in the ring.
   */
  private void fill(int n) throws IOException {
    if (eof) {
      return;
    }
    if (n > capacity) {
      throw new IOException("requested lookAhead exceeds buffer capacity");
    }
    while (count < n && !sourceExhausted) {
      int c = reader.read();
      if (c == -1) {
        sourceExhausted = true;
      } else {
        add((char) c);
      }
    }
  }

  /**
   * Returns the {@code n}<i>th</i> unread character (counting from 0) as an {@code int},
   * or {@code -1} if the input ends before it. Does not check {@link #eof}.
   */
  private int look(int n) throws IOException {
    if (n < 0) {
      throw new IllegalArgumentException("negative lookahead: " + n);
    }
    if (n >= capacity) {
      throw new IOException("requested lookAhead exceeds buffer capacity");
    }
    fill(n + 1);
    if (n >= count) {
      return -1;
    }
    int i = head + n;
    if (i >= capacity) {
      i -= capacity;
    }
    return arr[i];
  }

  /**
   * Returns {@code true} if the next {@code char} that will be read is {@code c}.
   * Always {@code false} at end of input.
   *
   * @param c the {@code char} to test for
   * @return {@code true} if the next {@code char} that will be read is {@code c}
   * @throws IOException if an I/O error occurs
   */
  public boolean at(char c) throws IOException {
    if (eof) {
      return false;
    }
    return look(0) == c;
  }

  /**
   * Returns {@code true} if the {@code n}<i>th</i> {@code char} that will be read
   * is {@code c},
   * counting from 0 ({@code at(0, 'x')} tests the first {@code char} that will be read).
   * Always {@code false} if the input ends before that position.
   *
   * @param n the zero-based offset of the {@code char} to test
   * @param c the {@code char} to test for
   * @return {@code true} if the {@code n}<i>th</i> {@code char} that will be read
   * is {@code c}
   * @throws IOException if an I/O error occurs or {@code n} is not less than the capacity
   * @throws IllegalArgumentException if {@code n} is negative
   */
  public boolean at(int n, char c) throws IOException {
    if (eof) {
      return false;
    }
    return look(n) == c;
  }

  /**
   * Returns {@code true} if the next {@code char}s that will be read
   * match the {@link java.lang.String} {@code s}.
   * Characters are fetched one at a time, so a mismatch on an early character
   * never reads further ahead than necessary.
   *
   * @param s the {@link java.lang.String} to match
   * @return {@code true} if the next {@code char}s that will be read
   * match the {@link java.lang.String} {@code s}
   * @throws IOException if an I/O error occurs or the match needs more lookahead than the capacity
   */
  public boolean at(String s) throws IOException {
    if (eof) {
      return false;
    }
    int len = s.length();
    for (int i = 0; i < len; i++) {
      if (look(i) != s.charAt(i)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns {@code true} if the next {@code char} that will be read
   * is {@code #}.
   * <p>
   * A "comment" starts with {@code #}
   * and runs until the end of the line.
   * <p>
   * Can easily be overridden to accommodate alternative definitions of "comment".
   *
   * @return {@code true} if a comment starts at the current position
   * @throws IOException if an I/O error occurs
   */
  public boolean atComment() throws IOException {
    return at('#');
  }

  /**
   * Returns {@code true} if all the {@code char}s have been read.
   *
   * @return {@code true} if all the {@code char}s have been read
   * @throws IOException if an I/O error occurs
   */
  public boolean atEOF() throws IOException {
    return eof || look(0) == -1;
  }

  /**
   * Returns {@code true} if the next {@code char} that will be read is '\n'
   * or '\r' or if {@link #atEOF()}.
   *
   * @return {@code true} if the next {@code char} that will be read is '\n'
   * or '\r' or if {@link #atEOF()}
   * @throws IOException if an I/O error occurs
   */
  public boolean atEOL() throws IOException {
    if (eof) {
      return true;
    }
    int next = look(0);
    // End of input must count as end of line, otherwise a final line without
    // a line terminator can never be skipped to its end.
    return next == -1 || next == '\n' || next == '\r';
  }

  /**
   * If the next {@code char} that will be read is {@code c},
   * returns {@code true} and removes {@code c} from the input.
   *
   * @param c the {@code char} to test for
   * @return {@code true} if the next {@code char} that will be read is {@code c}
   * @throws IOException if an I/O error occurs
   */
  public boolean atSkip(char c) throws IOException {
    if (eof) {
      return false;
    }
    if (look(0) != c) {
      return false;
    }
    remove();
    return true;
  }

  /**
   * If the next {@code char}s that will be read match the {@link java.lang.String} {@code s},
   * returns {@code true} and removes the {@code char}s from the input.
   *
   * @param s the {@link java.lang.String} to match
   * @return {@code true} if the next {@code char}s that will be read match the {@link java.lang.String} {@code s}
   * @throws IOException if an I/O error occurs or the match needs more lookahead than the capacity
   */
  public boolean atSkip(String s) throws IOException {
    if (!at(s)) {
      return false;
    }
    skip(s.length());
    return true;
  }

  /**
   * If the next characters to be read will be a "comment",
   * returns {@code true}
   * and removes the comment from the input.
   * The line terminator that ends the comment is left in the input.
   * <p>
   * A "comment" starts where {@link #atComment()} returns {@code true}
   * (by default at {@code #})
   * and runs until the end of the line or the end of the input.
   * <p>
   * Can easily be overridden to accommodate alternative definitions of "comment".
   *
   * @return {@code true} if a comment was skipped
   * @throws IOException if an I/O error occurs
   */
  public boolean atSkipComment() throws IOException {
    if (!atComment()) {
      return false;
    }
    while (!atEOL()) {
      skip(1);
    }
    return true;
  }

  /**
   * If the next {@code char} that will be read will be a white space character,
   * returns {@code true}
   * and removes the {@code char} from the input.
   * <p>
   * Can easily be overridden to accommodate alternative definitions of "white space".
   *
   * @return {@code true} if the next {@code char} that will be read will be a white space character
   * @throws IOException if an I/O error occurs
   */
  public boolean atSkipWhitespaceChar() throws IOException {
    if (!atWhitespace()) {
      return false;
    }
    skip(1);
    return true;
  }

  /**
   * If the "word" that will be read matches {@code w} (and no other word),
   * returns {@code true}
   * and removes the word from the input.
   * Case sensitive.
   * <p>
   * The match fails if {@code w} is immediately followed by an identifier character
   * (see {@link #isIdentifierChar(char)}: ASCII letters, digits, underscore and hyphen).
   * <p>
   * Can easily be overridden to accommodate alternative definitions of "word".
   *
   * @param w the "word" to match
   * @return {@code true} if the "word" that will be read matches {@code w} (and no other word)
   * @throws IOException if an I/O error occurs or the match needs more lookahead than the capacity
   */
  public boolean atSkipWord(String w) throws IOException {
    if (!atWord(w)) {
      return false;
    }
    skip(w.length());
    return true;
  }

  /**
   * Returns {@code true} if the next {@code char} that will be read will be a white space character.
   * <p>
   * Can easily be overridden to accommodate alternative definitions of "white space".
   *
   * @return {@code true} if the next {@code char} that will be read will be a white space character
   * @throws IOException if an I/O error occurs
   */
  public boolean atWhitespace() throws IOException {
    return isWhitespace(peek());
  }

  /**
   * Returns {@code true} if the "word" that will be read matches {@code w} (and no other word).
   * Case sensitive.
   * <p>
   * The match fails if {@code w} is immediately followed by an identifier character
   * (see {@link #isIdentifierChar(char)}: ASCII letters, digits, underscore and hyphen).
   * This needs one character of lookahead beyond {@code w}.
   * <p>
   * Can easily be overridden to accommodate alternative definitions of "word".
   *
   * @param w the "word" to match
   * @return {@code true} if the "word" that will be read matches {@code w} (and no other word)
   * @throws IOException if an I/O error occurs or the match needs more lookahead than the capacity
   */
  public boolean atWord(String w) throws IOException {
    return at(w) && !isIdentifierChar(peek(w.length()));
  }

  /**
   * Closes the {@link LookAheadReader} and the wrapped {@link java.io.Reader}.
   * Afterwards reads report end of input and any mark is discarded.
   *
   * @throws IOException if closing the wrapped reader fails
   */
  @Override
  public void close() throws IOException {
    try {
      reader.close();
    } finally {
      // Leave a consistent "nothing left" state even if the wrapped close failed.
      arr = null;
      count = 0;
      retained = NO_MARK;
      sourceExhausted = true;
      eof = true;
    }
  }

  /**
   * Marks the present position in the stream.
   * Subsequent calls to {@link #reset()} will reposition the stream to this point.
   * <p>
   * Characters consumed after the mark are kept in the ring buffer, which they share
   * with the lookahead. The mark therefore stays valid only while the number of
   * characters consumed since the mark plus the characters currently buffered ahead
   * does not exceed the capacity; lookahead that needs the space invalidates the mark.
   *
   * @param readAheadLimit Limit on the number of characters that may be read while still preserving the mark.
   * After reading this many characters, attempting to reset the stream may fail.
   * @throws IOException if {@code readAheadLimit} exceeds buffer capacity
   * @throws IllegalArgumentException if {@code readAheadLimit} is negative
   */
  @Override
  public void mark(int readAheadLimit) throws IOException {
    if (readAheadLimit < 0) {
      throw new IllegalArgumentException("negative readAheadLimit: " + readAheadLimit);
    }
    if (readAheadLimit > capacity) {
      throw new IOException("requested readAheadLimit exceeds buffer capacity");
    }
    retained = 0;
  }

  /**
   * Returns {@code true} if this reader supports the {@link #mark(int)} operation, which it does,
   * but only up to the maximum capacity of the buffer.
   *
   * @return {@code true}
   */
  @Override
  public boolean markSupported() {
    return true;
  }

  /**
   * Returns the next {@code char} that will be read.
   *
   * @return the next {@code char} that will be read,
   * or ({@code char})-1 at end of input
   * @throws IOException if an I/O error occurs
   */
  public char peek() throws IOException {
    if (eof) {
      return (char) -1;
    }
    return (char) look(0);
  }

  /**
   * Returns the {@code n}<i>th</i> {@code char} that will be read,
   * counting from 0 ({@code peek(0)} returns the first {@code char} that will be read).
   *
   * @param n the zero-based offset of the {@code char} to return
   * @return the {@code n}<i>th</i> {@code char} that will be read,
   * or ({@code char})-1 if the input ends before that position
   * @throws IOException if an I/O error occurs or {@code n} is not less than the capacity
   * @throws IllegalArgumentException if {@code n} is negative
   */
  public char peek(int n) throws IOException {
    if (eof) {
      return (char) -1;
    }
    return (char) look(n);
  }

  /**
   * Reads a single character.
   * This method will block until a character is available,
   * an I/O error occurs,
   * or the end of the stream is reached.
   *
   * @return The character read, as an integer in the range 0 to 65535 (0x00-0xffff),
   * or -1 if the end of the stream has been reached
   * @throws IOException if an I/O error occurs
   */
  @Override
  public int read() throws IOException {
    if (eof) {
      return -1;
    }
    // Always go through the ring so a valid mark can replay the character.
    fill(1);
    if (count == 0) {
      eof = true;
      return -1;
    }
    return remove();
  }

  /**
   * Reads characters into a portion of an array.
   * This method will block until some input is available,
   * an I/O error occurs,
   * or the end of the stream is reached.
   * If len is zero, then no characters are read and 0 is returned; otherwise,
   * there is an attempt to read at least one character.
   * If no character is available because the stream is at its end, the value -1 is returned; otherwise,
   * at least one character is read and stored into cbuf.
   *
   * @param cbuf Destination buffer
   * @param off Offset at which to start storing characters
   * @param len Maximum number of characters to read
   * @return The number of characters read,
   * or -1 if the end of the stream has been reached
   * @throws IOException if an I/O error occurs
   * @throws IndexOutOfBoundsException if {@code off} or {@code len} is negative,
   * or {@code len} is greater than {@code cbuf.length - off}
   */
  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    if (off < 0 || len < 0 || len > cbuf.length - off) {
      throw new IndexOutOfBoundsException("off=" + off + ", len=" + len + ", length=" + cbuf.length);
    }
    if (len == 0) {
      return 0;
    }
    if (eof) {
      return -1;
    }
    int n = 0;
    // Characters already looked at come first.
    while (n < len && count > 0) {
      cbuf[off + n++] = remove();
    }
    if (n < len && !sourceExhausted) {
      if (retained >= 0) {
        // A mark is valid: pull through the ring so reset() can replay.
        // Block only for the first character.
        while (n < len && (n == 0 || reader.ready())) {
          fill(1);
          if (count == 0) {
            break;
          }
          cbuf[off + n++] = remove();
        }
      } else if (n == 0 || reader.ready()) {
        // No mark: hand the rest of the request straight to the source.
        int r = reader.read(cbuf, off + n, len - n);
        if (r == -1) {
          sourceExhausted = true;
        } else {
          n += r;
        }
      }
    }
    if (n == 0) {
      eof = true;
      return -1;
    }
    return n;
  }

  /**
   * Reads a single character.
   * This method will block until a character is available,
   * an I/O error occurs,
   * or the end of the stream is reached.
   *
   * @return The character read, as {@code char},
   * or ({@code char})-1 if the end of the stream has been reached
   * @throws IOException if an I/O error occurs
   */
  public char readChar() throws IOException {
    return (char) read();
  }

  /**
   * Returns {@code true} if this reader is ready to be read.
   *
   * @return {@code true} if the next read() is guaranteed not to block for input,
   * {@code false} otherwise.
   * Note that returning {@code false} does not guarantee that the next read will block.
   * @throws IOException if an I/O error occurs
   */
  @Override
  public boolean ready() throws IOException {
    return count > 0 || reader.ready();
  }

  /**
   * Repositions the reader to the mark.
   * The mark itself stays in place, so the reader can be reset to it again.
   *
   * @throws IOException if the reader was not marked or
   * if the mark was invalidated by reading past the read ahead limit.
   */
  @Override
  public void reset() throws IOException {
    if (retained < 0) {
      throw new IOException("not marked or mark invalidated");
    }
    head -= retained;
    if (head < 0) {
      head += capacity;
    }
    count += retained;
    retained = 0;
    if (count > 0) {
      // There is something to read again, even if end of input had been reported.
      eof = false;
    }
  }

  /**
   * Skips characters.
   *
   * @param n the number of characters to skip
   * @return the number of characters actually skipped
   * @throws IllegalArgumentException if {@code n} is negative
   * @throws IOException if an I/O error occurs
   */
  @Override
  public long skip(long n) throws IOException {
    if (n < 0) {
      throw new IllegalArgumentException("skip value is negative");
    }
    if (eof) {
      return 0;
    }
    if (n <= count) {
      consume((int) n);
      return n;
    }
    long skipped = count;
    consume(count);
    // While a mark is valid, skipped characters must pass through the ring
    // so that reset() can return to them.
    while (skipped < n && retained >= 0) {
      fill(1);
      if (count == 0) {
        break;
      }
      remove();
      skipped++;
    }
    if (skipped < n && !sourceExhausted) {
      skipped += reader.skip(n - skipped);
    }
    return skipped;
  }

  /**
   * Skips white space and comments.
   * <p>
   * The class can easily be subclassed to accommodate alternative definitions of
   * white space and comments.
   *
   * @throws IOException if an I/O error occurs
   */
  public void skipWhitespaceAndComments() throws IOException {
    while (atSkipWhitespaceChar() || atSkipComment()) {
      // each successful call has already consumed input
    }
  }
}