/*
shlex, simple shell-like lexical analysis library
Copyright (C) 2019 Victor Porton

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.

This code was a rewrite of a Python 3.7 module with the same name:
Copyright © 2001-2019 Python Software Foundation; All Rights Reserved
*/

module shlex;

import std.typecons;
import std.conv;
import std.string;
import std.utf;
import std.regex;
import std.array;
import std.range.interfaces;
import std.range.primitives;
import std.container;
import std.container.dlist;
import std.algorithm;
import std.file;
import std.path;
import std.stdio : write, writeln;
import pure_dependency.providers;
import struct_params;

// TODO: use moveFront()/moveBack()

alias ShlexStream = InputRange!(const dchar); // Unicode stream

/**
Input range of code points over the contents of a text file.

The file is read completely by the constructor; the range then walks the
in-memory text.
*/
class ShlexFile : InputRange!dchar {
    private string text;

    /// The current version reads the file entirely
    this(string name) {
        text = readText(name);
    }

    /// Current (decoded) code point.
    override @property dchar front() {
        return text.front;
    }

    /// Returns the current code point (see `std.range.primitives.moveFront`).
    override dchar moveFront() {
        return text.moveFront();
    }

    /// Advances to the next code point.
    override void popFront() {
        return text.popFront();
    }

    /// True when the whole text has been consumed.
    override @property bool empty() {
        return text.empty;
    }

    /// `foreach` support; iterates over a copy of the slice, so the range itself is not advanced.
    override int opApply(scope int delegate(dchar) dg) {
        int res;
        for (auto r = text; !r.empty; r.popFront()) {
            res = dg(r.front);
            if (res) break;
        }
        return res;
    }

    /// `foreach` support with a running code point index; does not advance the range itself.
    override int opApply(scope int delegate(size_t, dchar) dg) {
        int res;
        size_t i = 0;
        for (auto r = text; !r.empty; r.popFront()) {
            res = dg(i, r.front);
            if (res) break;
            i++;
        }
        return res;
    }

    ///
    void close() { } // we have already read the file
}

/// Consumes `stream` up to and including the next '\n' (or up to the end of the stream).
private void skipLine(ShlexStream stream) {
    while (!stream.empty && stream.front != '\n') stream.popFront();
    if (!stream.empty && stream.front == '\n') stream.popFront();
}

/// A lexical analyzer class for simple shell-like syntaxes
struct Shlex {
    alias Posix = Flag!"posix";
    alias PunctuationChars = Flag!"PunctuationChars";
    alias Comments = Flag!"comments";

private:
    ShlexStream instream;
    Nullable!string infile;
    Posix posix;
    Nullable!string eof; // seems not efficient
    //bool delegate(string token) isEof;
    // Allocated in the constructor: a class instance created in a field
    // initializer would be one object shared by every Shlex value.
    RedBlackTree!(immutable dchar) commenters;
    RedBlackTree!(immutable dchar) wordchars;
    static immutable whitespace = new RedBlackTree!(immutable dchar)(" \t\r\n");
    bool whitespaceSplit = false;
    static immutable quotes = new RedBlackTree!(immutable dchar)("'\"");
    static immutable escape = new RedBlackTree!(immutable dchar)("\\"); // char or string?
    static immutable escapedquotes = new RedBlackTree!(immutable dchar)("\""); // char or string?
    // Lexer state: ' ' = between tokens, 'a' = inside a word, 'c' = inside a
    // punctuation run, a quote/escape character = inside that construct,
    // null = end of input reached.
    Nullable!dchar state = ' '; // a little inefficient?
    auto pushback = DList!string(); // may be not the fastest
    uint lineno;
    ubyte debug_ = 0;
    string token = "";
    auto filestack = DList!(Tuple!(Nullable!string, ShlexStream, uint))(); // may be not the fastest
    Nullable!string source; // TODO: Represent no source just as an empty string?
    // Allocated in the constructor (see the note on `commenters`).
    RedBlackTree!(immutable dchar) punctuationChars;
    // _pushbackChars is a push back queue used by lookahead logic
    auto _pushbackChars = DList!dchar(); // may be not the fastest

public:
    @disable this();

    /**
    We don't support implicit stdin as `instream` as in Python.

    Params:
        instream = stream of code points to tokenize
        infile = name of the file the stream comes from, if any
        posix = `Yes.posix` selects POSIX parsing rules (quotes are stripped,
                escapes are processed, EOF is a null token);
                otherwise EOF is reported as the empty string
        punctuationCharsFlag = when set, runs of `();<>|&` are returned as
                separate tokens and `~-./*?=` become word characters
        whitespaceSplit = when true, tokens are split on whitespace only
    */
    this(ShlexStream instream,
         Nullable!string infile = Nullable!string.init,
         Posix posix = No.posix,
         PunctuationChars punctuationCharsFlag = No.PunctuationChars,
         bool whitespaceSplit = false)
    {
        this.instream = instream;
        this.infile = infile;
        this.posix = posix;
        this.whitespaceSplit = whitespaceSplit;
        if (!posix) eof = "";
        // Fresh trees for every lexer, so that mutating them never affects another instance.
        commenters = new RedBlackTree!(immutable dchar)("#");
        punctuationChars = new RedBlackTree!(immutable dchar)();
        wordchars = new RedBlackTree!(immutable dchar)("abcdfeghijklmnopqrstuvwxyz" ~ "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_");
        if (posix)
            wordchars.stableInsert("ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ" ~ "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ");
        lineno = 1;
        if (punctuationCharsFlag) {
            punctuationChars.stableInsert("();<>|&");
            // these chars added because allowed in file names, args, wildcards
            wordchars.stableInsert("~-./*?=");
            // remove any punctuation chars from wordchars
            // TODO: Isn't it better to use dstring?
            wordchars = new RedBlackTree!(immutable dchar)(filter!(c => c !in punctuationChars)(wordchars[]));
        }
    }

    /**
    Convenience constructor for any source that `dtext` can turn into a
    `dstring` (for example a `string`); the text is converted in memory and
    then handed to the main constructor.
    */
    this(Stream)(Stream instream,
                 Nullable!string infile = Nullable!string.init,
                 Posix posix = No.posix,
                 PunctuationChars punctuationChars = No.PunctuationChars,
                 bool whitespaceSplit = false)
    {
        import std.conv;
        // TODO: Inefficient to convert to dstring in memory.
        this(cast (ShlexStream)inputRangeObject(cast (const dchar[])instream.dtext), infile, posix, punctuationChars, whitespaceSplit);
    }

    /// Prints the lexer state and the token collected so far when `debug_ >= 3`.
    void dump() {
        if (debug_ >= 3) {
            // writeln("state='", state, "\' nextchar='", nextchar, "\' token='", token, '\'');
            // describeChar() is used because `state` is null after end of input.
            writeln("state='", describeChar(state), "\' token='", token, '\'');
        }
    }

    /** Push a token onto the stack popped by the getToken method */
    void pushToken(string tok) {
        if (debug_ >= 1)
            writeln("shlex: pushing token " ~ tok);
        pushback.insertFront(tok);
    }

    /**
    Push an input source onto the lexer's input source stack.

    This overload accepts any iterable source of characters; it is buffered
    into an array of code points and pushed as a `ShlexStream`.
    */
    void pushSource(Stream)(Stream newstream, Nullable!string newfile = Nullable!string.init) {
        // The original pushed `instream` (the current source) instead of `newstream`.
        const(dchar)[] buffered = newstream.array;
        pushSource(cast (ShlexStream)inputRangeObject(buffered), newfile);
    }

    /** Push an input source onto the lexer's input source stack. */
    void pushSource(ShlexStream newstream, Nullable!string newfile = Nullable!string.init) {
        filestack.insertFront(tuple(this.infile, this.instream, this.lineno));
        this.infile = newfile;
        this.instream = newstream;
        this.lineno = 1;
        if (debug_) {
            if (newfile.isNull)
                writeln("shlex: pushing to stream %s".format(this.instream));
            else
                writeln("shlex: pushing to file %s".format(this.infile));
        }
    }

    /** Pop the input source stack. */
    void popSource() {
        // Only ShlexFile has close(); for any other stream the cast yields null.
        if (auto file = cast(ShlexFile)instream)
            file.close();
        // use a tuple library?
        auto t = filestack.front;
        filestack.removeFront();
        infile = t[0];
        instream = t[1];
        lineno = t[2];
        if (debug_)
            writeln("shlex: popping to %s, line %d".format(instream, lineno));
        state = ' ';
    }

    // TODO: Use empty string for None?
    /**
    Get a token from the input stream (or from stack if it's nonempty).

    Returns: the next token, or the value of `eof` at end of input
             (null in POSIX mode, the empty string otherwise).
    */
    Nullable!string getToken() {
        if (!pushback.empty) {
            immutable tok = pushback.front;
            pushback.removeFront();
            if (debug_ >= 1)
                writeln("shlex: popping token " ~ tok);
            return nullable(tok);
        }
        // No pushback.  Get a token.
        Nullable!string raw = readToken();
        // Handle inclusions
        if (!source.isNull && !source.get().empty) {
            while (raw == source) {
                auto spec = sourcehook(readToken().get());
                if (!spec.empty) {
                    auto newfile = spec[0];
                    auto newstream = spec[1];
                    pushSource(newstream, nullable(newfile));
                }
                raw = getToken();
            }
        }
        // Maybe we got EOF instead?
        while (eof == raw) {
            if (filestack.empty)
                return eof;
            else {
                popSource();
                raw = getToken();
            }
        }
        // Neither inclusion nor EOF
        if (debug_ >= 1) {
            if (eof != raw)
                writeln("shlex: token=" ~ raw.get);
            else
                writeln("shlex: token=EOF");
        }
        return raw;
    }

    /// `foreach` support: yields tokens until end of input.
    int opApply(scope int delegate(ref string) dg) {
        int result = 0;
        while (true) {
            auto r = getToken();
            // Compare with `eof`, not just null: in non-POSIX mode end of
            // input is the empty string, and getToken() keeps returning it.
            if (r.isNull || r == eof) break;
            string tok = r.get;
            result = dg(tok);
            if (result) break;
        }
        return result;
    }

    // TODO: Use empty string for None?
    /**
    Reads one raw token from the current input source, ignoring the token
    pushback stack and source inclusion.

    Returns: the token; in POSIX mode a null value when an empty, unquoted
             token is produced (end of input).
    Throws: Exception on an unterminated quotation or a dangling escape.
    */
    Nullable!string readToken() {
        bool quoted = false;
        dchar escapedstate = ' '; // TODO: use an enum
        while (true) {
            if (debug_ >= 3) {
                write("Iteration ");
                dump();
            }
            // Null `nextchar` means end of input.
            Nullable!dchar nextchar;
            if (!punctuationChars.empty && !_pushbackChars.empty) {
                nextchar = _pushbackChars.back;
                _pushbackChars.removeBack();
            } else {
                if (!instream.empty) {
                    nextchar = instream.front;
                    instream.popFront();
                }
            }
            if (nextchar == '\n')
                ++lineno;
            if (debug_ >= 3)
                writeln("shlex: in state %s I see character: %s".format(describeChar(state), describeChar(nextchar)));
            if (state.isNull) {
                // TODO: Debugger shows that this is never reached. Is this code needed?
                token = ""; // past end of file
                break;
            } else if (state.get() == ' ') {
                if (nextchar.isNull) {
                    state.nullify(); // end of file
                    break;
                } else if (nextchar.get in whitespace) {
                    if (debug_ >= 2)
                        writeln("shlex: I see whitespace in whitespace state");
                    if (!token.empty || (posix && quoted))
                        break; // emit current token
                    else
                        continue;
                } else if (nextchar.get in commenters) {
                    instream.skipLine();
                    ++lineno;
                } else if (posix && nextchar.get in escape) {
                    escapedstate = 'a';
                    state = nextchar.get();
                } else if (nextchar.get in wordchars) {
                    token = [nextchar.get].toUTF8;
                    state = 'a';
                } else if (nextchar.get in punctuationChars) {
                    token = [nextchar.get].toUTF8;
                    state = 'c';
                } else if (nextchar.get in quotes) {
                    if (!posix) token = [nextchar.get].toUTF8;
                    state = nextchar.get();
                } else if (whitespaceSplit) {
                    token = [nextchar.get].toUTF8;
                    state = 'a';
                } else {
                    token = [nextchar.get].toUTF8;
                    if (!token.empty || (posix && quoted))
                        break; // emit current token
                    else
                        continue;
                }
            } else if (!state.isNull && state.get() in quotes) {
                quoted = true;
                if (nextchar.isNull) { // end of file
                    if (debug_ >= 2)
                        writeln("shlex: I see EOF in quotes state");
                    // XXX what error should be raised here?
                    throw new Exception("No closing quotation");
                }
                if (nextchar.get() == state.get()) {
                    if (!posix) {
                        token ~= nextchar.get();
                        state = ' ';
                        break;
                    } else
                        state = 'a';
                } else if (posix && !nextchar.isNull && nextchar.get in escape &&
                           !state.isNull && state.get in escapedquotes) {
                    escapedstate = state.get();
                    state = nextchar.get();
                } else
                    token ~= nextchar.get();
            } else if (!state.isNull && state.get() in escape) {
                if (nextchar.isNull) { // end of file
                    if (debug_ >= 2)
                        writeln("shlex: I see EOF in escape state");
                    // XXX what error should be raised here?
                    throw new Exception("No escaped character");
                }
                // In posix shells, only the quote itself or the escape
                // character may be escaped within quotes.
                if (escapedstate in quotes && nextchar.get() != state.get() && nextchar.get() != escapedstate)
                    token ~= state.get();
                token ~= nextchar.get();
                state = escapedstate;
            } else if (!state.isNull && (state.get == 'a' || state.get == 'c')) {
                if (nextchar.isNull) {
                    state.nullify(); // end of file
                    break;
                } else if (nextchar.get in whitespace) {
                    if (debug_ >= 2)
                        writeln("shlex: I see whitespace in word state");
                    state = ' ';
                    if (!token.empty || (posix && quoted))
                        break; // emit current token
                    else
                        continue;
                } else if (nextchar.get in commenters) {
                    instream.skipLine();
                    ++lineno;
                    if (posix) {
                        state = ' ';
                        if (!token.empty || (posix && quoted))
                            break; // emit current token
                        else
                            continue;
                    }
                } else if (state == 'c') {
                    if (nextchar.get in punctuationChars)
                        token ~= nextchar.get();
                    else {
                        // `!x in y` would parse as `(!x) in y`; `!in` is the
                        // negated membership test, so the first character
                        // after a punctuation run is kept for the next token.
                        if (nextchar.get !in whitespace)
                            _pushbackChars.insertBack(nextchar.get());
                        state = ' ';
                        break;
                    }
                } else if (posix && nextchar.get in quotes)
                    state = nextchar.get();
                else if (posix && nextchar.get in escape) {
                    escapedstate = 'a';
                    state = nextchar.get();
                } else if (nextchar.get in wordchars || nextchar.get in quotes || whitespaceSplit) {
                    token ~= nextchar.get();
                } else {
                    if (punctuationChars.empty)
                        pushback.insertFront(nextchar.get.to!string);
                    else
                        _pushbackChars.insertBack(nextchar.get());
                    if (debug_ >= 2)
                        writeln("shlex: I see punctuation in word state");
                    state = ' ';
                    if (!token.empty || (posix && quoted))
                        break; // emit current token
                    else
                        continue;
                }
            }
        }
        Nullable!string result = token;
        //writeln('['~token~']');
        token = "";
        if (posix && !quoted && result == "")
            result.nullify();
        if (debug_ > 1) {
            if (!result.isNull && !result.get().empty) // TODO: can simplify?
                writeln("shlex: raw token=" ~ result.get);
            else
                writeln("shlex: raw token=EOF");
        }
        return result;
    }

    /**
    Hook called on a filename to be sourced.

    Returns: a tuple of the resolved file name and a `ShlexFile` opened on it.
    */
    auto sourcehook(string newfile) {
        // Strip surrounding quotes; the length check avoids indexing an
        // empty string and slicing a lone quote character.
        if (newfile.length >= 2 && newfile[0] == '"')
            newfile = newfile[1..$-1];
        // This implements cpp-like semantics for relative-path inclusion.
        // Without a current file name there is no directory to resolve against.
        if (!infile.isNull && !isAbsolute(newfile))
            newfile = buildPath(dirName(infile.get()), newfile);
        return tuple(newfile, new ShlexFile(newfile));
    }

    /**
    Emit a C-compiler-like, Emacs-friendly error-message leader.

    Params:
        infile = file name to report; defaults to the current input file
        lineno = line number to report; defaults to the current line
    */
    string errorLeader(Nullable!string infile = Nullable!string.init,
                       Nullable!uint lineno=Nullable!uint.init)
    {
        if (infile.isNull)
            infile = this.infile;
        if (lineno.isNull)
            lineno = this.lineno;
        return "\"%s\", line %d: ".format(infile, lineno);
    }

    /// Text used in debug output for a state or input character; "EOF" when null.
    private static string describeChar(Nullable!dchar c) {
        return c.isNull ? "EOF" : [c.get].toUTF8;
    }
}

mixin StructParams!("ShlexParams",
                    ShlexStream, "instream",
                    Nullable!string, "infile",
                    Shlex.Posix, "posix",
                    Shlex.PunctuationChars, "punctuationCharsFlag",
                    bool, "whitespaceSplit");
private ShlexParams.WithDefaults shlexDefaults = { infile: Nullable!string.init,
                                                   posix: No.posix,
                                                   punctuationCharsFlag: No.PunctuationChars,
                                                   whitespaceSplit: false };
alias ShlexProvider = ProviderWithDefaults!(Callable!(
    (ShlexStream instream, Nullable!string infile,
     Shlex.Posix posix,
     Shlex.PunctuationChars punctuationCharsFlag,
     bool whitespaceSplit) => new Shlex(instream, infile, posix, punctuationCharsFlag, whitespaceSplit)),
    ShlexParams, shlexDefaults);

template ShlexProviderStream(Stream) {
    mixin StructParams!("ShlexParams",
                        Stream, "instream",
                        Nullable!string, "infile",
                        Shlex.Posix, "posix",
                        Shlex.PunctuationChars, "punctuationCharsFlag",
                        bool, "whitespaceSplit");
    private ShlexParams.WithDefaults shlexDefaults = { infile: Nullable!string.init,
                                                       posix: No.posix,
                                                       punctuationCharsFlag: No.PunctuationChars,
                                                       whitespaceSplit: false };
    alias ShlexProvider = ProviderWithDefaults!(Callable!(
        (Stream instream, Nullable!string infile,
         Shlex.Posix posix,
         Shlex.PunctuationChars punctuationCharsFlag,
         bool whitespaceSplit) => new Shlex(instream, infile, posix, punctuationCharsFlag, whitespaceSplit)),
        ShlexParams, shlexDefaults);
}

// TODO: Use dependency injection.
/**
Split the string `s` using shell-like syntax.

Params:
    s = text to split
    comments = when `No.comments` (the default), `#` does not start a comment
    posix = parsing mode handed to the lexer (POSIX by default)
Returns: the tokens of `s`, split on whitespace.
*/
string[] split(string s, Shlex.Comments comments = No.comments, Shlex.Posix posix = Yes.posix) {
    scope Shlex lex = Shlex(s, Nullable!string.init, posix); // TODO: shorten
    lex.whitespaceSplit = true;
    if (!comments) {
        // Install a fresh empty set instead of clearing the existing tree,
        // which may be shared with other lexers.
        lex.commenters = new RedBlackTree!(immutable dchar)();
    }
    return lex.array;
}

unittest {
    version (Posix) {
        import core.sys.posix.sys.resource;
        auto limit = rlimit(100*1000000, 100*1000000);
        setrlimit(RLIMIT_AS, &limit); // prevent OS crash due out of memory
    }

    assert(split("") == []);
    assert(split("l") == ["l"]);
    assert(split("ls") == ["ls"]);
    assert(split("ls -l 'somefile; ls -xz ~'") == ["ls", "-l", "somefile; ls -xz ~"]);
    assert(split("ssh home 'somefile; ls -xz ~'") == ["ssh", "home", "somefile; ls -xz ~"]);
}

// Matches any character that is not safe to leave unquoted in a shell word.
// `_` is part of the safe set, as with `\w` in the Python original.
private immutable _findUnsafe = regex(r"[^a-zA-Z0-9_@%+=:,./-]");

/**
Return a shell-escaped version of the string *s*.

An empty string becomes `''`; a string consisting only of safe characters is
returned unchanged; anything else is wrapped in single quotes.
*/
string quote(string s) {
    if (s.empty)
        return "''";
    if (!matchFirst(s, _findUnsafe))
        return s;

    // use single quotes, and put single quotes into double quotes
    // the string $'b is then quoted as '$'"'"'b'
    return '\'' ~ s.replace("'", "'\"'\"'") ~ '\'';
}

unittest {
    assert(quote("") == "''");
    assert(quote("somefile; ls -xz ~") == "'somefile; ls -xz ~'");
    assert(quote("some_file.txt") == "some_file.txt");
    assert(quote("'") == "''\"'\"''"); // TODO: Too long result (as inherited from the Python library)
}

/// Debug helper: prints every token of `lexer` until a null or empty token is returned.
void _printTokens(Shlex lexer) {
    while (true) {
        Nullable!string tt = lexer.getToken();
        if (tt.isNull || tt.get().empty) break; // TODO: can simplify?
        writeln("Token: " ~ tt.get);
    }
}