001/* 002Copyright (c) 2022-2023 Steve Shering 003 004All rights reserved. 005 006As a special exception, the copyright holder of this software gives you permission 007to use this software for personal, not-for-profit purposes. 008 009For any other purpose, a license must be obtained from the copyright holder. 010 011This copyright notice and this permission notice must be included in all copies 012of this software, including copies of parts of this software. 013 014THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 017AUTHOR OR COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 019OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 020THE SOFTWARE. 021*/ 022package net.sherst.io; 023 024import java.io.IOException; 025import java.io.Reader; 026 027/** 028 * Wrapper for any {@link java.io.Reader} 029 * which provides look-ahead methods ({@code at}, {@code atSkip}, {@code atWordSkip}, {@code peek}) 030 * and some useful utility methods ({@link #readChar}, {@link #atEOF}, {@link #skipWhitespaceAndComments()}). 031 * <p> 032 * Makes most parsing tasks easy and efficient, 033 * including parsing without a separate lexer 034 * and parsing nested grammars. 035 * <p> 036 * The capacity is the maximum lookahead (in number of characters). 037 * This must be specified when the {@link LookAheadReader} is created. 038 * {@link LookAheadReader} uses a fast but fixed-size ring buffer. 039 * <p> 040 * The intended use is parsing source code from start to finish in one thread. Not thread safe. 041 * 042 * @author sherstDotNet@yahoo.com 043 */ 044public class LookAheadReader extends Reader { 045 private char[] arr; 046 private int capacity=0; 047 private boolean eof=false; 048 private int head=0; 049 private int mark=-1; 050 private int count=0; 051 private Reader reader; 052 private int tail=0; 053 054 static public boolean isDecimalDigit(char c) { 055 return (c>='0') && (c<='9'); 056 } 057 058 static public boolean isIdentifierChar(char c) { 059 return isIdentifierStart(c) || isDecimalDigit(c); 060 } 061 062 static public boolean isIdentifierStart(char c) { 063 return ((c>='A')&&(c<='Z')) || (c=='_')||((c>='a')&&(c<='z')||(c=='-')); 064 } 065 066 public static boolean isWhitespace(char c) { 067 return (c==' ')||(c=='\n')||(c=='\r')||(c=='\t')||(c==65279); 068 } 069 070 /** 071 * Creates a new {@link LookAheadReader} 072 * which wraps the provided {@link java.io.Reader} 073 * with the default lookahead limit of 128 characters. 074 * 075 * @param reader the {@link java.io.Reader} to wrap 076 */ 077 public LookAheadReader(Reader reader) { 078 this(reader, 128); 079 } 080 081 /** 082 * Creates a new {@link LookAheadReader} 083 * which wraps the provided {@link java.io.Reader} 084 * with specified lookahead limit. 085 * 086 * @param reader the {@link java.io.Reader} to wrap 087 * @param capacity the look ahead limit (number of characters) 088 */ 089 090 public LookAheadReader(Reader reader, int capacity) { 091 this.reader=reader; 092 this.capacity=capacity; 093 arr=new char[capacity]; 094 } 095 096 private void add(char c) { 097 if (count==capacity) 098 throw new IllegalStateException("buffer full"); 099 arr[tail]=c; 100 count++; 101 tail++; 102 if (tail==capacity) 103 tail=0; 104 if (mark!=-1 && tail==mark+1) 105 mark=-1; 106 ;} 107 108 /** 109 * Returns {@code true} if the next {@code char} that will be read is {@code c}. 110 * 111 * @param c the {@code char} to test for 112 * @return {@code true} if the next {@code char} that will be read is {@code c} 113 * @throws IOException 114 */ 115 public boolean at(char c) throws IOException { 116 if (eof) 117 return false; 118 fill(1); 119 return arr[head]==c; 120 } 121 122 /** 123 * Returns {@code true} if the {@code n}<i>th</i> {@code char} that will be read 124 * is {@code c}, 125 * counting from 0 ({@code at(0, 'x')} tests the first {@code char} that will be read). 126 * 127 * @param n 128 * @param c the {@code char} to test for 129 * @return {@code true} if the {@code n}<i>th</i> {@code char} that will be read 130 * is {@code c} 131 * @throws IOException 132 */ 133 public boolean at(int n, char c) throws IOException { 134 if (eof) 135 return false; 136 fill(n+1); 137 n+=head; 138 if (n>=capacity) 139 n-=capacity; 140 return arr[n]==c; 141 } 142 143 /** 144 * Returns {@code true} if the next {@code chars}s that will be read 145 * match the {@link java.lang.String} {@code s}. 146 * 147 * @param s the {@link java.lang.String} to match 148 * @return {@code true} if the next {@code chars}s that will be read 149 * match the {@link java.lang.String} {@code s} 150 * @throws IOException 151 */ 152 public boolean at(String s) throws IOException { 153 if (eof) 154 return false; 155 int len=s.length(); 156 for (int i=0; i<len; i++) 157 if (peek(i)!=s.charAt(i)) 158 return false; 159 return true; 160 } 161 162 /** 163 * Returns {@code true} if the next {@code chars} that will be read 164 * is {@code #}. 165 * <p> 166 * A "comment" starts with {@code #} 167 * and runs until the end of the line. 168 * <p> 169 * Can easily be overridden to accommodate alternative definitions of "comment". 170 * @throws IOException 171 */ 172 public boolean atComment() throws IOException { 173 return at('#'); 174 } 175 176 /** 177 * Returns {@code true} if all the {@code chars}s have been read. 178 * 179 * @return {@code true} if all the {@code chars}s have been read 180 * @throws IOException 181 */ 182 public boolean atEOF() throws IOException { 183 return eof || peek()==(char) -1; 184 } 185 186 /** 187 * Returns {@code true} if all the next {@code char} that will be read is '\n' 188 * or '\r' or if {@link #atEOF()}. 189 * 190 * @return {@code true} if all the next {@code char} that will be read is '\n' 191 * or '\r' or if {@link #atEOF()} 192 * @throws IOException 193 */ 194 public boolean atEOL() throws IOException { 195 if (eof) 196 return true; 197 char next=peek(); 198 return ((next=='\n')||(next=='\r')); 199 } 200 201 /** 202 * If the next {@code char} that will be read is {@code c}, 203 * returns {@code true} and removes {@code c} from the input. 204 * 205 * @param c the {@code true} to test for 206 * @return {@code true} if the next {@code char} that will be read is {@code c} 207 * @throws IOException 208 */ 209 public boolean atSkip(char c) throws IOException { 210 if (eof) 211 return false; 212 fill(1); 213 if (arr[head]!=c) 214 return false; 215 head++; 216 if (head==capacity) 217 head=0; 218 count--; 219 return true; 220 } 221 222 /** 223 * If the next {@code char}s that will be read match the {@link java.lang.String} {@code s}, 224 * returns {@code true} and removes the {@code char}s from the input. 225 * 226 * @param s the {@link java.lang.String} to match 227 * @return {@code true} if the next {@code char} that will be read match the {@link java.lang.String} {@code s} 228 * @throws IOException 229 */ 230 public boolean atSkip(String s) throws IOException { 231 if (!at(s)) 232 return false; 233 skip(s.length()); 234 return true; 235 } 236 237 /** 238 * If the next characters to be read will be a "comment", 239 * returns {@code true} 240 * and removes the comment from the input. 241 * <p> 242 * A "comment" starts with {@code #} 243 * and runs until the end of the line. 244 * <p> 245 * Can easily be overridden to accommodate alternative definitions of "comment". 246 * @throws IOException 247 */ 248 public boolean atSkipComment() throws IOException { 249 if (!atSkip('#')) 250 return false; 251 while (!atEOL()) 252 skip(1); 253 return true; 254 } 255 256 /** 257 * If the next {@code char} that will be read will be a white space character, 258 * returns {@code true} 259 * and removes the {@code char} from the input. 260 * <p> 261 * Can easily be overridden to accommodate alternative definitions of "white space". 262 * @return {@code true} if the next {@code char} that will be read will be a white space character 263 * @throws IOException 264 */ 265 public boolean atSkipWhitespaceChar() throws IOException { 266 if (atWhitespace()) { 267 skip(1); 268 return true; 269 } 270 return false; 271 } 272 273 /** 274 * If the "word" that will be read matches {@code w} (and no other word). 275 * returns true 276 * and removes the word from the input. 277 * Case sensitive. 278 * <p> 279 * A "word" starts with a letter or underscore 280 * and contains only letters, underscores and digits. 281 * <p> 282 * Can easily be overridden to accommodate alternative definitions of "word". 283 * 284 * @param w the "word" to match 285 * @return {@code true} if the "word" that will be read matches {@code w} (and no other word) 286 * @throws IOException 287 */ 288 public boolean atSkipWord(String w) throws IOException { 289 if (!atWord(w)) 290 return false; 291 skip(w.length()); 292 return true; 293 } 294 295 /** 296 * Returns {@code true} if the next {@code char} that will be read will be a white space character. 297 * <p> 298 * Can easily be overridden to accommodate alternative definitions of "white space". 299 * @return {@code true} if the next {@code char} that will be read will be a white space character 300 * @throws IOException 301 */ 302 public boolean atWhitespace() throws IOException { 303 return isWhitespace(peek()); 304 } 305 306 /** 307 * Returns true if the "word" that will be read matches {@code w} (and no other word). 308 * Case sensitive. 309 * <p> 310 * A "word" starts with a letter or underscore 311 * and contains only letters, underscores and digits. 312 * <p> 313 * Can easily be overridden to accommodate alternative definitions of "word". 314 * <p> 315 * @param w the "word" to match 316 * @return {@code true} if the "word" that will be read matches {@code w} (and no other word) 317 * @throws IOException 318 */ 319 public boolean atWord(String w) throws IOException { 320 if (!at(w)) 321 return false; 322 if (isIdentifierChar(peek(w.length()))) 323 return false; 324 return true; 325 } 326 327 /** 328 * Closes the {@link LookAheadReader} and the wrapped {@link java.lang.Reader}. 329 * 330 * @throws IOException 331 */ 332 @Override 333 public void close() throws IOException { 334 reader.close(); 335 arr=null; 336 eof=true; 337 } 338 339 private void fill(int n) throws IOException { 340 if (eof) 341 return; 342 if (n>capacity) 343 throw new IOException("requested lookAhead exceeds buffer capacity"); 344 while (count<n) 345 add((char) reader.read()); 346 } 347 348 /** 349 * Marks the present position in the stream. 350 * Subsequent calls to {@link #reset()} will reposition the stream to this point. 351 * 352 * @param readAheadLimit Limit on the number of characters that may be read while still preserving the mark. 353 * After reading this many characters, attempting to reset the stream may fail. 354 * @throws IOException if {@code readAheadLimit} exceeds buffer capacity 355 */ 356 @Override 357 public void mark(int readAheadLimit) throws IOException { 358 if (readAheadLimit>capacity) 359 throw new IOException("requested readAheadLimit exceeds buffer capacity"); 360 mark=head; 361 ;} 362 363 /** 364 * Returns {@code true} if this reader supports the {@link #mark(int)} operation, which it does, 365 * but only up to the maximum capacity of the buffer. 366 * 367 * @return {@code true} 368 */ 369 @Override 370 public boolean markSupported() { 371 return true; 372 } 373 374 /** 375 * Returns the next {@code char} that will be read. 376 * 377 * @return the next {@code char} that will be read 378 * @throws IOException 379 */ 380 public char peek() throws IOException { 381 if (eof) 382 return (char) -1; 383 fill(1); 384 return arr[head]; 385 } 386 387 /** 388 * Returns the {@code n}<i>th</i> {@code char} that will be read, 389 * counting from 0 ({@code peek(0)} returns the first {@code char} that will be read). 390 * 391 * @return the {@code n}<i>th</i> {@code char} that will be read 392 * @throws IOException 393 */ 394 public char peek(int n) throws IOException { 395 if (eof) 396 return (char) -1; 397 fill(n+1); 398 n+=head; 399 if (n>=capacity) 400 n-=capacity; 401 return arr[n]; 402 } 403 404 /** 405 * Reads a single character. 406 * This method will block until a character is available, 407 * an I/O error occurs, 408 * or the end of the stream is reached. 409 * 410 * @return The character read, as an integer in the range 0 to 65535 (0x00-0xffff), 411 * or -1 if the end of the stream has been reached 412 * @throws IOException 413 */ 414 @Override 415 public int read() throws IOException { 416 if (eof) 417 return -1; 418 int c; 419 if (count>0) 420 c=remove(); 421 else 422 c=reader.read(); 423 if (c==-1) 424 eof=true; 425 return c; 426 } 427 428 /** 429 * Reads characters into a portion of an array. 430 * This method will block until some input is available, 431 * an I/O error occurs, 432 * or the end of the stream is reached. 433 * If len is zero, then no characters are read and 0 is returned; otherwise, 434 * there is an attempt to read at least one character. 435 * If no character is available because the stream is at its end, the value -1 is returned; otherwise, 436 * at least one character is read and stored into cbuf. 437 * 438 * @param cbuf Destination buffer 439 * @param off Offset at which to start storing characters 440 * @param len Maximum number of characters to read 441 * @return The number of characters read, 442 * or -1 if the end of the stream has been reached 443 * @throws IOException 444 */ 445 @Override 446 public int read(char[] cbuf, int off, int len) throws IOException { 447 if (len==0) 448 return 0; 449 if (eof) 450 return -1; 451 int cread=0; 452 int oldSize=count; 453 while (cread<len && cread<oldSize) { 454 cbuf[off+cread]=remove(); 455 cread++; 456 } 457 return reader.read(cbuf, off+cread, len-cread)+cread; 458 } 459 460 /** 461 * Reads a single character. 462 * This method will block until a character is available, 463 * an I/O error occurs, 464 * or the end of the stream is reached. 465 * 466 * @return The character read, as {@code char}, 467 * or ({@code char})-1 if the end of the stream has been reached 468 * @throws IOException 469 */ 470 public char readChar() throws IOException { 471 return (char) read(); 472 } 473 474 /** 475 * Returns {@code true} if this reader is ready to be read. 476 * 477 * @return {@code true} if the next read() is guaranteed not to block for input, 478 * {@code false} otherwise. 479 * Note that returning {@code false} does not guarantee that the next read will block. 480 * @throws IOException 481 */ 482 @Override 483 public boolean ready() throws IOException { 484 return count>0 || reader.ready(); 485 } 486 487 private char remove() { 488 if (count==0) 489 throw new IllegalStateException("buffer empty"); 490 char c=arr[head]; 491 head++; 492 if (head==capacity) 493 head=0; 494 count--; 495 return c; 496 } 497 498 /** 499 * Repositions the reader to the mark. 500 * 501 * @throws IOException if the reader was not marked or 502 * if the mark was invalidated by reading past the read ahead limit. 503 */ 504 @Override 505 public void reset() throws IOException { 506 if (mark==-1) 507 throw new IOException("not marked or mark invalidated"); 508 head=mark; 509 } 510 511 /** 512 * Skips characters. 513 * 514 * @param n the number of characters to skip 515 * @return the number of characters actually skipped 516 * @throws IOException 517 */ 518 @Override 519 public long skip(long n) throws IOException { 520 if (eof) 521 return 0; 522 if (n>=count) { 523 head=0; 524 tail=0; 525 var oldSize=count; 526 count=0; 527 return reader.skip(n-oldSize)+oldSize; 528 } 529 head+=n; 530 while (head>=capacity) 531 head-=capacity; 532 count-=n; 533 return n; 534 } 535 536 /** 537 * Skips white space and comments. 538 * <p> 539 * The class can easily be subclassed to accommodate alternative definitions of 540 * white space and comments. 541 */ 542 public void skipWhitespaceAndComments() throws IOException { 543 while (true) { 544 if (atSkipWhitespaceChar()) 545 ; 546 else if (atSkipComment()) 547 ; 548 else 549 break; 550 } 551 } 552 }