/*
shlex, simple shell-like lexical analysis library
Copyright (C) 2019 Victor Porton

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.

This code was a rewrite of a Python 3.7 module with the same name:
Copyright © 2001-2019 Python Software Foundation; All Rights Reserved
*/

module shlex;

import std.typecons;
import std.conv;
import std.string;
import std.utf;
import std.regex;
import std.array;
import std.range.interfaces;
import std.range.primitives;
import std.container;
import std.container.dlist;
import std.algorithm;
import std.file;
import std.path;
import std.stdio : write, writeln;

// TODO: use moveFront()/moveBack()

alias ShlexStream = InputRange!(const dchar); // Unicode stream

/// An input range of code points over the text of a file.
class ShlexFile : InputRange!dchar {
    private string text;

    /// The current version reads the file entirely
    this(string name) {
        text = readText(name);
    }

    /// The next (decoded) code point of the remaining text.
    override @property dchar front() {
        return text.front;
    }

    ///
    override dchar moveFront() {
        return text.moveFront();
    }

    /// Drops the first code point of the remaining text.
    override void popFront() {
        text.popFront();
    }

    /// True once the whole text has been consumed.
    override @property bool empty() {
        return text.empty;
    }

    /// Iterates over a copy of the remaining text; the range itself is not advanced.
    override int opApply(scope int delegate(dchar) dg) {
        int res;
        for (auto r = text; !r.empty; r.popFront()) {
            res = dg(r.front);
            if (res) break;
        }
        return res;
    }

    /// Same as above, additionally passing a running code point index.
    override int opApply(scope int delegate(size_t, dchar) dg) {
        int res;
        size_t i = 0;
        for (auto r = text; !r.empty; r.popFront()) {
            res = dg(i, r.front);
            if (res) break;
            i++;
        }
        return res;
    }

    ///
    void close() { } // we have already read the file
}

/// Consumes `stream` up to and including the next '\n' (or up to its end).
private void skipLine(ShlexStream stream) {
    while (!stream.empty && stream.front != '\n') stream.popFront();
    if (!stream.empty && stream.front == '\n') stream.popFront();
}

/// A lexical analyzer class for simple shell-like syntaxes
struct Shlex {
    alias Posix = Flag!"posix";
    alias PunctuationChars = Flag!"PunctuationChars";
    alias Comments = Flag!"comments";

private:
    // TODO: Python shlex has some of the following as public instance variables (also check visibility of member functions)
    ShlexStream instream;
    Nullable!string infile;
    Posix posix;
    Nullable!string eof; // seems not efficient
    //bool delegate(string token) isEof;
    // NOTE: `commenters` and `punctuationChars` are allocated in the constructor.
    // A `new` expression in a field initializer is evaluated once at compile
    // time, so the resulting object would be shared (and mutated) by every
    // Shlex instance.
    RedBlackTree!(immutable dchar) commenters;
    RedBlackTree!(immutable dchar) wordchars;
    static immutable whitespace = new RedBlackTree!(immutable dchar)(" \t\r\n");
    bool whitespaceSplit = false;
    static immutable quotes = new RedBlackTree!(immutable dchar)("'\"");
    static immutable escape = new RedBlackTree!(immutable dchar)("\\"); // char or string?
    static immutable escapedquotes = new RedBlackTree!(immutable dchar)("\""); // char or string?
    Nullable!dchar state = ' '; // a little inefficient?
    auto pushback = DList!string(); // may be not the fastest
    uint lineno;
    ubyte debug_ = 0;
    string token = "";
    auto filestack = DList!(Tuple!(Nullable!string, ShlexStream, uint))(); // may be not the fastest
    Nullable!string source; // TODO: Represent no source just as an empty string?
    RedBlackTree!(immutable dchar) punctuationChars;
    // _pushbackChars is a push back queue used by lookahead logic
    auto _pushbackChars = DList!dchar(); // may be not the fastest

    /// True if `tok` is the end-of-input marker of this lexer
    /// (a null token when `eof` is null, otherwise a token equal to `eof`).
    bool isEof(Nullable!string tok) {
        if (eof.isNull)
            return tok.isNull;
        return !tok.isNull && tok.get == eof.get;
    }

    /// Turns an arbitrary character source into a `ShlexStream`.
    static ShlexStream toStream(Stream)(Stream source) {
        static if (is(Stream : ShlexStream)) {
            return source;
        } else static if (isInputRange!Stream) {
            // TODO: Inefficient to buffer the whole input in memory.
            auto buffer = appender!(dchar[])();
            for (; !source.empty; source.popFront())
                buffer.put(source.front);
            const(dchar)[] chars = buffer.data;
            return cast(ShlexStream) inputRangeObject(chars);
        } else {
            return cast(ShlexStream) inputRangeObject(cast(const dchar[]) source.dtext);
        }
    }

    /// Name of the stream object for debug output.
    /// Formatting the stream itself with "%s" would format it as a range and consume the input.
    static string streamName(ShlexStream stream) {
        auto obj = cast(Object) stream;
        return obj is null ? "null" : obj.toString();
    }

public:
    @disable this();

    /** We don't support implicit stdin as `instream` as in Python. */
    this(ShlexStream instream,
         Nullable!string infile = Nullable!string.init,
         Posix posix = No.posix,
         PunctuationChars punctuationCharsFlag = No.PunctuationChars)
    {
        this.instream = instream;
        this.infile = infile;
        this.posix = posix;
        if (!posix) eof = "";
        commenters = new RedBlackTree!(immutable dchar)("#");
        punctuationChars = new RedBlackTree!(immutable dchar)();
        wordchars = new RedBlackTree!(immutable dchar)("abcdfeghijklmnopqrstuvwxyz" ~
                                                       "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_");
        if (posix)
            wordchars.stableInsert("ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ" ~
                                   "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ");
        lineno = 1;
        if (punctuationCharsFlag) {
            this.punctuationChars.stableInsert("();<>|&");
            // these chars added because allowed in file names, args, wildcards
            wordchars.stableInsert("~-./*?=");
            // remove any punctuation chars from wordchars
            // TODO: Isn't it better to use dstring?
            wordchars = new RedBlackTree!(immutable dchar)(filter!(c => c !in punctuationChars)(wordchars.array));
        }
    }

    /** Same as above, for any source convertible to text; the source is buffered in memory. */
    this(Stream)(Stream instream,
                 Nullable!string infile = Nullable!string.init,
                 Posix posix = No.posix,
                 PunctuationChars punctuationChars = No.PunctuationChars)
    {
        import std.conv;
        // TODO: Inefficient to convert to dstring in memory.
        this(cast (ShlexStream)inputRangeObject(cast (const dchar[])instream.dtext), infile, posix, punctuationChars);
    }

    /** Print the lexer state (only at debug level 3 and above). */
    void dump() {
        if (debug_ >= 3) {
            // writeln("state='", state, "\' nextchar='", nextchar, "\' token='", token, '\'');
            writeln("state='", state, "\' token='", token, '\'');
        }
    }

    /** Push a token onto the stack popped by the getToken method */
    void pushToken(string tok) {
        if (debug_ >= 1)
            writeln("shlex: pushing token " ~ tok);
        pushback.insertFront(tok);
    }

    /** Push an input source onto the lexer's input source stack. */
    void pushSource(Stream)(Stream newstream, Nullable!string newfile = Nullable!string.init) {
        // The stream to wrap is the *new* one, not the current `instream`.
        ShlexStream converted = toStream(newstream);
        pushSource(converted, newfile);
    }

    /** Push an input source onto the lexer's input source stack. */
    void pushSource(ShlexStream newstream, Nullable!string newfile = Nullable!string.init) {
        filestack.insertFront(tuple(this.infile, this.instream, this.lineno));
        this.infile = newfile;
        this.instream = newstream;
        this.lineno = 1;
        if (debug_) {
            if (newfile.isNull)
                writeln("shlex: pushing to stream %s".format(streamName(this.instream)));
            else
                writeln("shlex: pushing to file %s".format(this.infile.get));
        }
    }

    /** Pop the input source stack.

        Throws: Exception if there is no saved input source to return to. */
    void popSource() {
        if (filestack.empty)
            throw new Exception("No input source to pop");
        // Only file-backed streams have something to close; for any other
        // stream the cast yields null.
        if (auto file = cast(ShlexFile) instream)
            file.close();
        // use a tuple library?
        auto t = filestack.front;
        filestack.removeFront();
        infile = t[0];
        instream = t[1];
        lineno = t[2];
        if (debug_)
            writeln("shlex: popping to %s, line %d".format(streamName(instream), lineno));
        state = ' ';
    }

    // TODO: Use empty string for None?
    /** Get a token from the input stream (or from stack if it's nonempty).
        Returns the `eof` marker at end of input: a null value in POSIX mode,
        an empty string otherwise. */
    Nullable!string getToken() {
        if (!pushback.empty) {
            string tok = pushback.front;
            pushback.removeFront();
            if (debug_ >= 1)
                writeln("shlex: popping token " ~ tok);
            return Nullable!string(tok);
        }
        // No pushback.  Get a token.
        Nullable!string raw = readToken();
        // Handle inclusions
        if (!source.isNull && !source.get.empty) {
            while (!raw.isNull && raw.get == source.get) {
                Nullable!string fileToken = readToken();
                // A `source` keyword at the very end of input has no file name.
                if (!isEof(fileToken)) {
                    auto spec = sourcehook(fileToken.get);
                    pushSource(spec[1], nullable(spec[0]));
                }
                raw = getToken();
            }
        }
        // Maybe we got EOF instead?
        while (isEof(raw)) {
            if (filestack.empty)
                return eof;
            popSource();
            raw = getToken();
        }
        // Neither inclusion nor EOF
        if (debug_ >= 1) {
            if (!isEof(raw))
                writeln("shlex: token=" ~ raw.get);
            else
                writeln("shlex: token=EOF");
        }
        return raw;
    }

    /** Iterate over the remaining tokens, stopping at end of input. */
    int opApply(scope int delegate(ref string) dg) {
        int result = 0;
        while (true) {
            auto r = getToken();
            // In non-POSIX mode EOF is an empty (non-null) token, so testing
            // for null alone would never terminate.
            if (isEof(r)) break;
            string tok = r.get;
            result = dg(tok);
            if (result) break;
        }
        return result;
    }

    // TODO: Use empty string for None?
    /** Read one raw token from the current input stream, ignoring the token
        pushback stack and the input source stack.

        Throws: Exception on end of input inside quotes or right after an escape character. */
    Nullable!string readToken() {
        bool quoted = false;
        dchar escapedstate = ' '; // TODO: use an enum
        while (true) {
            if (debug_ >= 3) {
                write("Iteration ");
                dump();
            }
            Nullable!dchar nextchar;
            if (!punctuationChars.empty && !_pushbackChars.empty) {
                nextchar = _pushbackChars.back;
                _pushbackChars.removeBack();
            } else if (!instream.empty) {
                nextchar = instream.front;
                instream.popFront();
            }
            if (!nextchar.isNull && nextchar.get == '\n')
                ++lineno;
            if (debug_ >= 3)
                writeln("shlex: in state %s I see character: %s".format(state, nextchar));
            if (state.isNull) {
                token = ""; // past end of file
                break;
            } else if (state.get == ' ') {
                if (nextchar.isNull) {
                    state.nullify(); // end of file
                    break;
                } else if (nextchar.get in whitespace) {
                    if (debug_ >= 2)
                        writeln("shlex: I see whitespace in whitespace state");
                    if (!token.empty || (posix && quoted))
                        break; // emit current token
                    else
                        continue;
                } else if (nextchar.get in commenters) {
                    instream.skipLine();
                    ++lineno;
                } else if (posix && nextchar.get in escape) {
                    escapedstate = 'a';
                    state = nextchar.get;
                } else if (nextchar.get in wordchars) {
                    token = [nextchar.get].toUTF8;
                    state = 'a';
                } else if (nextchar.get in punctuationChars) {
                    token = [nextchar.get].toUTF8;
                    state = 'c';
                } else if (nextchar.get in quotes) {
                    if (!posix) token = [nextchar.get].toUTF8;
                    state = nextchar.get;
                } else if (whitespaceSplit) {
                    token = [nextchar.get].toUTF8;
                    state = 'a';
                } else {
                    token = [nextchar.get].toUTF8;
                    if (!token.empty || (posix && quoted))
                        break; // emit current token
                    else
                        continue;
                }
            } else if (state.get in quotes) {
                quoted = true;
                if (nextchar.isNull) { // end of file
                    if (debug_ >= 2)
                        writeln("shlex: I see EOF in quotes state");
                    // XXX what error should be raised here?
                    throw new Exception("No closing quotation");
                }
                if (nextchar.get == state.get) {
                    if (!posix) {
                        token ~= nextchar.get;
                        state = ' ';
                        break;
                    } else
                        state = 'a';
                } else if (posix && nextchar.get in escape && state.get in escapedquotes) {
                    escapedstate = state.get;
                    state = nextchar.get;
                } else
                    token ~= nextchar.get;
            } else if (state.get in escape) {
                if (nextchar.isNull) { // end of file
                    if (debug_ >= 2)
                        writeln("shlex: I see EOF in escape state");
                    // XXX what error should be raised here?
                    throw new Exception("No escaped character");
                }
                // In posix shells, only the quote itself or the escape
                // character may be escaped within quotes.
                if (escapedstate in quotes && nextchar.get != state.get && nextchar.get != escapedstate)
                    token ~= state.get;
                token ~= nextchar.get;
                state = escapedstate;
            } else if (state.get == 'a' || state.get == 'c') {
                if (nextchar.isNull) {
                    state.nullify(); // end of file
                    break;
                } else if (nextchar.get in whitespace) {
                    if (debug_ >= 2)
                        writeln("shlex: I see whitespace in word state");
                    state = ' ';
                    if (!token.empty || (posix && quoted))
                        break; // emit current token
                    else
                        continue;
                } else if (nextchar.get in commenters) {
                    instream.skipLine();
                    ++lineno;
                    if (posix) {
                        state = ' ';
                        if (!token.empty || (posix && quoted))
                            break; // emit current token
                        else
                            continue;
                    }
                } else if (state.get == 'c') {
                    if (nextchar.get in punctuationChars)
                        token ~= nextchar.get;
                    else {
                        // `!x in y` parses as `(!x) in y`; `!in` is required so
                        // that the character ending a punctuation run is kept.
                        if (nextchar.get !in whitespace)
                            _pushbackChars.insertBack(nextchar.get);
                        state = ' ';
                        break;
                    }
                } else if (posix && nextchar.get in quotes)
                    state = nextchar.get;
                else if (posix && nextchar.get in escape) {
                    escapedstate = 'a';
                    state = nextchar.get;
                } else if (nextchar.get in wordchars || nextchar.get in quotes || whitespaceSplit) {
                    token ~= nextchar.get;
                } else {
                    if (punctuationChars.empty)
                        pushback.insertFront(nextchar.get.to!string);
                    else
                        _pushbackChars.insertBack(nextchar.get);
                    if (debug_ >= 2)
                        writeln("shlex: I see punctuation in word state");
                    state = ' ';
                    if (!token.empty || (posix && quoted))
                        break; // emit current token
                    else
                        continue;
                }
            }
        }
        Nullable!string result = token;
        //writeln('['~token~']');
        token = "";
        if (posix && !quoted && result.get == "")
            result.nullify();
        if (debug_ > 1) {
            if (!result.isNull && !result.get.empty) // TODO: can simplify?
                writeln("shlex: raw token=" ~ result.get);
            else
                writeln("shlex: raw token=EOF");
        }
        return result;
    }

    /** Hook called on a filename to be sourced.

        Returns: a tuple of the resolved file name and a `ShlexFile` opened on it. */
    auto sourcehook(string newfile) {
        // Strip the surrounding double quotes, if any.
        if (newfile.length >= 2 && newfile[0] == '"')
            newfile = newfile[1 .. $ - 1];
        // This implements cpp-like semantics for relative-path inclusion.
        if (!infile.isNull && !isAbsolute(newfile))
            newfile = buildPath(dirName(infile.get), newfile);
        return tuple(newfile, new ShlexFile(newfile));
    }

    /** Emit a C-compiler-like, Emacs-friendly error-message leader.

        A null `infile` / `lineno` is replaced by the lexer's current value. */
    string errorLeader(Nullable!string infile = Nullable!string.init,
                       Nullable!uint lineno = Nullable!uint.init)
    {
        if (infile.isNull)
            infile = this.infile;
        if (lineno.isNull)
            lineno = this.lineno;
        return "\"%s\", line %d: ".format(infile, lineno);
    }
}

/** Split the string `s` into tokens using shell-like syntax.

    Comment handling is disabled unless `comments` is set. */
string[] split(string s, Shlex.Comments comments = No.comments, Shlex.Posix posix = Yes.posix) {
    scope Shlex lex = Shlex(s, Nullable!string.init, posix); // TODO: shorten
    lex.whitespaceSplit = true;
    if (!comments)
        lex.commenters.clear();
    return lex.array;
}

unittest {
    version (Posix) {
        import core.sys.posix.sys.resource;
        auto limit = rlimit(100*1000000, 100*1000000);
        setrlimit(RLIMIT_AS, &limit); // prevent OS crash due out of memory
    }

    assert(split("") == []);
    assert(split("l") == ["l"]);
    assert(split("ls") == ["ls"]);
    assert(split("ls -l 'somefile; ls -xz ~'") == ["ls", "-l", "somefile; ls -xz ~"]);
    assert(split("ssh home 'somefile; ls -xz ~'") == ["ssh", "home", "somefile; ls -xz ~"]);
    // Disabling comments in one call must not affect later lexers.
    assert(split("a #b", Yes.comments) == ["a"]);
}

// Any character outside this set makes quoting necessary.
private immutable _findUnsafe = regex(r"[^a-zA-Z0-9_@%+=:,./-]");

/** Return a shell-escaped version of the string *s*. */
string quote(string s) {
    if (s.empty)
        return "''";
    if (!matchFirst(s, _findUnsafe))
        return s;

    // use single quotes, and put single quotes into double quotes
    // the string $'b is then quoted as '$'"'"'b'
    return '\'' ~ s.replace("'", "'\"'\"'") ~ '\'';
}

unittest {
    assert(quote("") == "''");
    assert(quote("somefile; ls -xz ~") == "'somefile; ls -xz ~'");
    assert(quote("'") == "''\"'\"''"); // TODO: Too long result (as inherited from the Python library)
}

/** Print every token of `lexer`, stopping at a null or empty token. */
void _printTokens(Shlex lexer) {
    while (true) {
        Nullable!string tt = lexer.getToken();
        if (tt.isNull || tt.get.empty) break; // TODO: can simplify?
        writeln("Token: " ~ tt.get);
    }
}