Class | RubyLex |
In: |
parsers/parse_rb.rb
|
Parent: | Object |
Lexical analyzer for Ruby source
ENINDENT_CLAUSE | = | ["case", "class", "def", "do", "for", "if", "module", "unless", "until", "while", "begin"] |
DEINDENT_CLAUSE | = | ["end"] |
PERCENT_LTYPE | = | { "q" => "\'", "Q" => "\"", "x" => "\`", "r" => "\/", "w" => "]" } |
PERCENT_PAREN | = | { "{" => "}", "[" => "]", "<" => ">", "(" => ")" } |
Ltype2Token | = | { "\'" => TkSTRING, "\"" => TkSTRING, "\`" => TkXSTRING, "\/" => TkREGEXP, "]" => TkDSTRING } |
DLtype2Token | = | { "\"" => TkDSTRING, "\`" => TkDXSTRING, "\/" => TkDREGEXP, } |
continue | [R] | |
exception_on_syntax_error | [RW] | |
indent | [R] | |
lex_state | [R] | |
read_auto_clean_up | [RW] | |
skip_space | [RW] |
# Builds a lexer over +content+: installs all operator/token rules
# (lex_init), wraps the input in a BufferedReader, and resets every
# piece of positional and lexical state to start-of-file values.
def initialize(content)
  lex_init

  @reader = BufferedReader.new(content)

  # Position bookkeeping: line/column counters start at line 1, column 0.
  @exp_line_no = @line_no = 1
  @base_char_no = 0
  @indent = 0

  # No string, heredoc or %-literal is open yet.
  @ltype = nil
  @quoted = nil
  @lex_state = EXPR_BEG
  @space_seen = false

  @continue = false
  @line = ""

  # Caller-tunable behaviour flags (see attr accessors on the class).
  @skip_space = false
  @read_auto_clean_up = false
  @exception_on_syntax_error = true
end
# Reads one line of input (through the terminating "\n"), dropping
# any "\r" characters. Returns nil at end of input; otherwise the
# accumulated line (which may lack a trailing newline at EOF).
def gets
  ch = getc
  return unless ch

  line = ""
  loop do
    line << ch unless ch == "\r"   # strip CR from CRLF input
    break if ch == "\n"
    ch = getc
    break unless ch                # EOF before a newline
  end
  line
end
# Consumes a "#" line comment (the leading "#" has already been read)
# and returns it as a TkCOMMENT token. A backslash-newline pair inside
# the comment is folded into a single space so the comment continues;
# the terminating newline itself is pushed back for the "\n" rule.
def identify_comment
  @ltype = "#"
  comment = "#"
  while ch = getc
    if ch == "\\"
      ch = getc
      # BUG FIX: at EOF directly after a backslash, getc returns nil and
      # the original `comment << ch` raised TypeError. Keep the trailing
      # backslash and stop instead.
      if ch.nil?
        comment << "\\"
        break
      end
      if ch == "\n"
        ch = " "              # escaped newline: comment continues
      else
        comment << "\\"       # ordinary backslash, keep it literally
      end
    else
      if ch == "\n"
        @ltype = nil
        ungetc                # let the newline rule see the "\n"
        break
      end
    end
    comment << ch
  end
  return Token(TkCOMMENT).set_text(comment)
end
# Lexes a global-variable-like token; the "$" has already been read.
# Distinguishes special globals ($~, $_, ...), option globals ($-x),
# regexp back-references ($&, $`, $', $+), numbered match groups ($1..),
# plain named globals, and a bare "$".
def identify_gvar
  @lex_state = EXPR_END
  str = "$"

  tk =
    case ch = getc
    when /[~_*$?!@\/\\;,=:<>".]/ #"
      # One-character special global such as $~ or $_.
      str << ch
      Token(TkGVAR, str)

    when "-"
      # Option-style global: $-I, $-w, etc. (consumes one more char).
      str << "-" << getc
      Token(TkGVAR, str)

    when "&", "`", "'", "+"
      # Regexp back-reference globals.
      str << ch
      Token(TkBACK_REF, str)

    when /[1-9]/
      # Numbered match reference: $1, $23, ...
      str << ch
      while (ch = getc) =~ /[0-9]/
        str << ch
      end
      ungetc
      Token(TkNTH_REF)

    when /\w/
      # Named global ($foo): back up over the char and the "$" and
      # let the identifier path handle it.
      ungetc
      ungetc
      return identify_identifier

    else
      # Lone "$" with nothing recognisable after it.
      ungetc
      Token("$")
    end

  tk.set_text(str)
end
# Lexes a heredoc after "<<" has been consumed. Handles the "-" indent
# marker, quoted terminators (<<'EOS', <<"EOS", <<`EOS`) and bare word
# terminators. The remainder of the introducing line is saved and
# re-injected into the reader afterwards so lexing resumes there.
def identify_here_document
  ch = getc
  if ch == "-"
    ch = getc
    indent = true          # <<- form: terminator may be indented
  end

  if /['"`]/ =~ ch # '
    # Quoted terminator: read up to the matching close quote.
    lt = ch
    quoted = ""
    while (c = getc) && c != lt
      quoted.concat c
    end
  else
    # Bare terminator word; treated like a double-quoted heredoc.
    lt = '"'
    quoted = ch.dup
    while (c = getc) && c =~ /\w/
      quoted.concat c
    end
    ungetc
  end

  ltback, @ltype = @ltype, lt

  # Capture the rest of the current line (it must be re-lexed after
  # the heredoc body has been consumed).
  reserve = ""
  while ch = getc
    reserve << ch
    if ch == "\\" #"
      ch = getc
      reserve << ch
    elsif ch == "\n"
      break
    end
  end

  # Accumulate body lines until the terminator line is seen.
  str = ""
  while (l = gets)
    l.chomp!
    l.strip! if indent
    break if l == quoted
    str << l.chomp << "\n"
  end

  # Hand the saved remainder of the introducing line back to the reader.
  @reader.divert_read_from(reserve)

  @ltype = ltback
  @lex_state = EXPR_END
  Token(Ltype2Token[lt], str).set_text(str.dump)
end
# Lexes an identifier-like token: globals ($x), instance variables
# (@x/@@x), reserved words (updating indent tracking and lex state via
# TkReading2Token's transition table), constants, !/? methods, and
# plain identifiers.
def identify_identifier
  token = ""
  # Leading sigils: "$", "@" and the second "@" of class variables.
  token.concat getc if peek(0) =~ /[$@]/
  token.concat getc if peek(0) == "@"

  while (ch = getc) =~ /\w|_/
    print ":", ch, ":" if RubyLex.debug?
    token.concat ch
  end
  ungetc

  # Trailing "!" or "?" is part of the method name.
  if ch == "!" or ch == "?"
    token.concat getc
  end

  case token
  when /^\$/
    return Token(TkGVAR, token).set_text(token)
  when /^\@/
    @lex_state = EXPR_END
    return Token(TkIVAR, token).set_text(token)
  end

  if @lex_state != EXPR_DOT
    print token, "\n" if RubyLex.debug?

    # Reserved word? TkReading2Token maps word -> [state, modifier-sym].
    token_c, *trans = TkReading2Token[token]
    if token_c
      if (@lex_state != EXPR_BEG &&
          @lex_state != EXPR_FNAME &&
          trans[1])
        # Modifier form (trailing if/unless/while/until).
        token_c = TkSymbol2Token[trans[1]]
        @lex_state = trans[0]
      else
        if @lex_state != EXPR_FNAME
          # Track nesting depth for indent-affecting keywords.
          if ENINDENT_CLAUSE.include?(token)
            @indent += 1
          elsif DEINDENT_CLAUSE.include?(token)
            @indent -= 1
          end
          @lex_state = trans[0]
        else
          @lex_state = EXPR_END
        end
      end
      return Token(token_c, token).set_text(token)
    end
  end

  if @lex_state == EXPR_FNAME
    @lex_state = EXPR_END
    # Setter method name: absorb the trailing "=".
    if peek(0) == '='
      token.concat getc
    end
  elsif @lex_state == EXPR_BEG || @lex_state == EXPR_DOT
    @lex_state = EXPR_ARG
  else
    @lex_state = EXPR_END
  end

  if token[0, 1] =~ /[A-Z]/
    return Token(TkCONSTANT, token).set_text(token)
  elsif token[token.size - 1, 1] =~ /[!?]/
    return Token(TkFID, token).set_text(token)
  else
    return Token(TkIDENTIFIER, token).set_text(token)
  end
end
# Lexes a numeric literal. +start+ is the already-consumed first
# character ("+", "-", a digit, or "" when called from the "." / ""
# rules). Handles hex (0x...), octal (0...), decimal integers and
# floats with optional exponent. Returns a TkINTEGER or TkFLOAT token.
def identify_number(start)
  str = start.dup

  # Sign (or nothing) consumed by the caller: fetch the first digit.
  if start == "+" or start == "-" or start == ""
    start = getc
    str << start
  end

  @lex_state = EXPR_END

  if start == "0"
    if peek(0) == "x"
      ch = getc
      str << ch
      # BUG FIX: was /[0-9a-f_]/, which rejected uppercase hex digits
      # and mis-lexed literals like 0xFF.
      match = /[0-9a-fA-F_]/
    else
      match = /[0-7_]/   # leading 0 => octal
    end
    while ch = getc
      if ch !~ match
        ungetc
        break
      else
        str << ch
      end
    end
    return Token(TkINTEGER).set_text(str)
  end

  type = TkINTEGER
  allow_point = true   # at most one "." ...
  allow_e = true       # ... and one exponent marker
  while ch = getc
    case ch
    when /[0-9_]/
      str << ch

    when allow_point && "."
      type = TkFLOAT
      # "1.foo" is a method call, not a float: push the "." back.
      if peek(0) !~ /[0-9]/
        ungetc
        break
      end
      str << ch
      allow_point = false

    when allow_e && "e", allow_e && "E"
      str << ch
      type = TkFLOAT
      str << getc if peek(0) =~ /[+-]/
      allow_e = false
      allow_point = false   # no "." after the exponent
    else
      ungetc
      break
    end
  end
  Token(type).set_text(str)
end
# Lexes a %-literal after the "%" (passed as +initial_char+) has been
# consumed. A letter selects the literal type via PERCENT_LTYPE
# (%q %Q %x %r %w); a non-word char means a bare %(...) string. The
# closing delimiter is the paired bracket from PERCENT_PAREN when the
# opener is a bracket, otherwise the opener itself.
def identify_quotation(initial_char)
  ch = getc
  if lt = PERCENT_LTYPE[ch]
    initial_char += ch
    ch = getc
  elsif ch =~ /\W/
    lt = "\""
  else
    RubyLex.fail SyntaxError, "unknown type of %string ('#{ch}')"
  end
  # Bracket openers close with their partner; anything else closes
  # with the same character.
  @quoted = ch unless @quoted = PERCENT_PAREN[ch]
  identify_string(lt, @quoted, ch, initial_char)
end
# Scans a string/regexp/%-literal body up to its closing delimiter.
# +ltype+ selects the token class, +quoted+ is the closing delimiter,
# +opener+ the (possibly different) opening bracket for nesting, and
# +initial_char+ any prefix such as "%q". Tracks #{...} interpolation
# (which upgrades the token to its dynamic variant) and escapes.
def identify_string(ltype, quoted = ltype, opener = nil, initial_char = nil)
  @ltype = ltype
  @quoted = quoted
  subtype = nil

  str = ""
  str << initial_char if initial_char
  str << (opener || quoted)

  nest = 0
  begin
    while ch = getc
      str << ch
      if @quoted == ch
        if nest == 0
          break              # matching close at top level: done
        else
          nest -= 1
        end
      elsif opener == ch
        nest += 1            # nested bracket opener inside %(...)
      elsif @ltype != "'" && @ltype != "]" and ch == "#"
        # Possible #{...} interpolation (not in %q/%w literals).
        ch = getc
        if ch == "{"
          subtype = true
          str << ch << skip_inner_expression
        else
          ungetc(ch)
        end
      elsif ch == '\\' #'
        str << read_escape
      end
    end
    # Regexp literals may carry trailing option flags.
    if @ltype == "/"
      if peek(0) =~ /i|o|n|e|s/
        str << getc
      end
    end
    if subtype
      Token(DLtype2Token[ltype], str)   # dynamic (interpolated) variant
    else
      Token(Ltype2Token[ltype], str)
    end.set_text(str)
  ensure
    @ltype = nil
    @quoted = nil
    @lex_state = EXPR_END
  end
end
# Lexes forward until a logical line is complete (a newline or
# end-of-script token while no continuation is pending) and returns
# the raw text consumed, or nil at end of input.
def lex
  tk = nil
  loop do
    tk = token
    break if tk.nil?
    line_end = tk.kind_of?(TkNL) || tk.kind_of?(TkEND_OF_SCRIPT)
    break if line_end && !@continue
  end

  line = get_read

  # Nothing consumed and nothing left: signal EOF with nil.
  if line == "" and tk.kind_of?(TkEND_OF_SCRIPT) || tk.nil?
    nil
  else
    line
  end
end
# Installs the first half of the lexer's dispatch rules into the SLex
# operator table (@OP): end-of-script markers, whitespace, comments,
# =begin/=end blocks, newlines, the common binary operators, heredoc
# detection for "<<", string openers, "?" char literals, op-assigns,
# unary +@/-@, signed numbers, "." and ranges. Finishes by delegating
# the remaining rules to lex_int2.
def lex_init()
  @OP = SLex.new
  # @OP = IRB::SLex.new  # 1.8 doesn't support #IRB::SLex

  # NUL, ^D and ^Z all terminate the script.
  @OP.def_rules("\0", "\004", "\032") do |chars, io|
    Token(TkEND_OF_SCRIPT).set_text(chars)
  end

  # Run of horizontal whitespace collapses into one TkSPACE.
  @OP.def_rules(" ", "\t", "\f", "\r", "\13") do |chars, io|
    @space_seen = true
    while (ch = getc) =~ /[ \t\f\r\13]/
      chars << ch
    end
    ungetc
    Token(TkSPACE).set_text(chars)
  end

  @OP.def_rule("#") do |op, io|
    identify_comment
  end

  # =begin ... =end block, only at column 0. Blocks tagged
  # "=begin rdoc" are surfaced as comments; others are discarded.
  @OP.def_rule("=begin", proc { @prev_char_no == 0 && peek(0) =~ /\s/ }) do |op, io|
    str = op
    @ltype = "="
    begin
      line = ""
      begin
        ch = getc
        line << ch
      end until ch == "\n"
      str << line
    end until line =~ /^=end/
    ungetc
    @ltype = nil
    if str =~ /\A=begin\s+rdoc/i
      str.sub!(/\A=begin.*\n/, '')
      str.sub!(/^=end.*/m, '')
      Token(TkCOMMENT).set_text(str)
    else
      Token(TkRD_COMMENT) #.set_text(str)
    end
  end

  # Newline: a continuation if we are mid-expression, otherwise the
  # end of a statement.
  @OP.def_rule("\n") do
    print "\\n\n" if RubyLex.debug?
    case @lex_state
    when EXPR_BEG, EXPR_FNAME, EXPR_DOT
      @continue = true
    else
      @continue = false
      @lex_state = EXPR_BEG
    end
    Token(TkNL).set_text("\n")
  end

  # Simple binary operators: emit as-is, expression restarts after.
  @OP.def_rules("*", "**",
                "!", "!=", "!~",
                "=", "==", "===",
                "=~", "<=>",
                "<", "<=",
                ">", ">=", ">>") do |op, io|
    @lex_state = EXPR_BEG
    Token(op).set_text(op)
  end

  # "<<" is either a heredoc opener or the shift operator.
  @OP.def_rules("<<") do |op, io|
    tk = nil
    if @lex_state != EXPR_END && @lex_state != EXPR_CLASS &&
       (@lex_state != EXPR_ARG || @space_seen)
      c = peek(0)
      if /[-\w_\"\'\`]/ =~ c
        tk = identify_here_document
      end
    end
    if !tk
      @lex_state = EXPR_BEG
      tk = Token(op).set_text(op)
    end
    tk
  end

  @OP.def_rules("'", '"') do |op, io|
    identify_string(op)
  end

  # Backquote is an operator name in def-method position, a command
  # string everywhere else.
  @OP.def_rules("`") do |op, io|
    if @lex_state == EXPR_FNAME
      Token(op).set_text(op)
    else
      identify_string(op)
    end
  end

  # "?" is the ternary operator or a ?c character literal.
  @OP.def_rules('?') do |op, io|
    if @lex_state == EXPR_END
      @lex_state = EXPR_BEG
      Token(TkQUESTION).set_text(op)
    else
      ch = getc
      if @lex_state == EXPR_ARG && ch !~ /\s/
        ungetc
        @lex_state = EXPR_BEG
        Token(TkQUESTION).set_text(op)
      else
        str = op
        str << ch
        if (ch == '\\') #'
          str << read_escape
        end
        @lex_state = EXPR_END
        Token(TkINTEGER).set_text(str)
      end
    end
  end

  @OP.def_rules("&", "&&", "|", "||") do |op, io|
    @lex_state = EXPR_BEG
    Token(op).set_text(op)
  end

  # Compound assignment: token carries the bare operator name.
  @OP.def_rules("+=", "-=", "*=", "**=",
                "&=", "|=", "^=", "<<=", ">>=", "||=", "&&=") do |op, io|
    @lex_state = EXPR_BEG
    op =~ /^(.*)=$/
    Token(TkOPASGN, $1).set_text(op)
  end

  # Unary operator method names, only valid in def/".method" position.
  @OP.def_rule("+@", proc { @lex_state == EXPR_FNAME }) do |op, io|
    Token(TkUPLUS).set_text(op)
  end

  @OP.def_rule("-@", proc { @lex_state == EXPR_FNAME }) do |op, io|
    Token(TkUMINUS).set_text(op)
  end

  # "+"/"-" may introduce a signed numeric literal.
  @OP.def_rules("+", "-") do |op, io|
    catch(:RET) do
      if @lex_state == EXPR_ARG
        if @space_seen and peek(0) =~ /[0-9]/
          throw :RET, identify_number(op)
        else
          @lex_state = EXPR_BEG
        end
      elsif @lex_state != EXPR_END and peek(0) =~ /[0-9]/
        throw :RET, identify_number(op)
      else
        @lex_state = EXPR_BEG
      end
      Token(op).set_text(op)
    end
  end

  # "." starts a float (".5") or a method call.
  @OP.def_rule(".") do
    @lex_state = EXPR_BEG
    if peek(0) =~ /[0-9]/
      ungetc
      identify_number("")
    else
      # for obj.if
      @lex_state = EXPR_DOT
      Token(TkDOT).set_text(".")
    end
  end

  @OP.def_rules("..", "...") do |op, io|
    @lex_state = EXPR_BEG
    Token(op).set_text(op)
  end

  lex_int2
end
# Installs the second half of the dispatch rules: closers, colons,
# "/" (division vs regexp), brackets/braces (call vs literal forms),
# line continuations, %-literals, globals, ivars, __END__ and the
# fallback rule for numbers/identifiers.
def lex_int2
  @OP.def_rules("]", "}", ")") do |op, io|
    @lex_state = EXPR_END
    @indent -= 1
    Token(op).set_text(op)
  end

  # ":" is a plain colon or the start of a symbol.
  @OP.def_rule(":") do
    if @lex_state == EXPR_END || peek(0) =~ /\s/
      @lex_state = EXPR_BEG
      tk = Token(TkCOLON)
    else
      @lex_state = EXPR_FNAME
      tk = Token(TkSYMBEG)
    end
    tk.set_text(":")
  end

  # "::" is scope resolution (Colon2) or top-level reference (Colon3).
  @OP.def_rule("::") do
    # p @lex_state.id2name, @space_seen
    if @lex_state == EXPR_BEG or @lex_state == EXPR_ARG && @space_seen
      @lex_state = EXPR_BEG
      tk = Token(TkCOLON3)
    else
      @lex_state = EXPR_DOT
      tk = Token(TkCOLON2)
    end
    tk.set_text("::")
  end

  # "/" may be division, "/=", or a regexp literal depending on state.
  @OP.def_rule("/") do |op, io|
    if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
      identify_string(op)
    elsif peek(0) == '='
      getc
      @lex_state = EXPR_BEG
      Token(TkOPASGN, :/).set_text("/=") #")
    elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
      identify_string(op)
    else
      @lex_state = EXPR_BEG
      Token("/").set_text(op)
    end
  end

  @OP.def_rules("^") do
    @lex_state = EXPR_BEG
    Token("^").set_text("^")
  end

  # @OP.def_rules("^=") do
  #   @lex_state = EXPR_BEG
  #   Token(TkOPASGN, :^)
  # end

  @OP.def_rules(",", ";") do |op, io|
    @lex_state = EXPR_BEG
    Token(op).set_text(op)
  end

  @OP.def_rule("~") do
    @lex_state = EXPR_BEG
    Token("~").set_text("~")
  end

  # BUG FIX: the guard proc used assignment (@lex_state = EXPR_FNAME),
  # which is always truthy and clobbers the lexer state as a side
  # effect. The parallel "+@"/"-@"/"[]" rules all compare with ==.
  @OP.def_rule("~@", proc { @lex_state == EXPR_FNAME }) do
    @lex_state = EXPR_BEG
    Token("~").set_text("~@")
  end

  @OP.def_rule("(") do
    @indent += 1
    if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
      @lex_state = EXPR_BEG
      tk = Token(TkfLPAREN)   # "f" variant: literal/call-like paren
    else
      @lex_state = EXPR_BEG
      tk = Token(TkLPAREN)
    end
    tk.set_text("(")
  end

  @OP.def_rule("[]", proc { @lex_state == EXPR_FNAME }) do
    Token("[]").set_text("[]")
  end

  @OP.def_rule("[]=", proc { @lex_state == EXPR_FNAME }) do
    Token("[]=").set_text("[]=")
  end

  # "[" is an array literal or an index operator depending on state.
  @OP.def_rule("[") do
    @indent += 1
    if @lex_state == EXPR_FNAME
      t = Token(TkfLBRACK)
    else
      if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
        t = Token(TkLBRACK)
      elsif @lex_state == EXPR_ARG && @space_seen
        t = Token(TkLBRACK)
      else
        t = Token(TkfLBRACK)
      end
      @lex_state = EXPR_BEG
    end
    t.set_text("[")
  end

  # "{" is a hash literal or a block opener depending on state.
  @OP.def_rule("{") do
    @indent += 1
    if @lex_state != EXPR_END && @lex_state != EXPR_ARG
      t = Token(TkLBRACE)
    else
      t = Token(TkfLBRACE)
    end
    @lex_state = EXPR_BEG
    t.set_text("{")
  end

  # Backslash-newline continues the logical line.
  @OP.def_rule('\\') do #'
    if getc == "\n"
      @space_seen = true
      @continue = true
      Token(TkSPACE).set_text("\\\n")
    else
      ungetc
      Token("\\").set_text("\\") #"
    end
  end

  # "%" is modulo, "%=", or a %-literal depending on state.
  @OP.def_rule('%') do |op, io|
    if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
      identify_quotation('%')
    elsif peek(0) == '='
      getc
      Token(TkOPASGN, "%").set_text("%=")
    elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
      identify_quotation('%')
    else
      @lex_state = EXPR_BEG
      Token("%").set_text("%")
    end
  end

  @OP.def_rule('$') do #'
    identify_gvar
  end

  @OP.def_rule('@') do
    if peek(0) =~ /[@\w_]/
      ungetc
      identify_identifier
    else
      Token("@").set_text("@")
    end
  end

  # __END__ at column 0 followed by a line break ends lexing.
  @OP.def_rule("__END__", proc { @prev_char_no == 0 && peek(0) =~ /[\r\n]/ }) do
    throw :eof
  end

  # Fallback rule: numbers and identifiers.
  @OP.def_rule("") do |op, io|
    printf "MATCH: start %s: %s\n", op, io.inspect if RubyLex.debug?
    if peek(0) =~ /[0-9]/
      t = identify_number("")
    elsif peek(0) =~ /[\w_]/
      t = identify_identifier
    end
    printf "MATCH: end %s: %s\n", op, io.inspect if RubyLex.debug?
    t
  end

  p @OP if RubyLex.debug?
end
# True if the unconsumed input begins with +str+ (pure lookahead —
# nothing is consumed). Delegates to the buffered reader.
def peek_equal?(str)
  @reader.peek_equal(str)
end
# Reads the remainder of a backslash escape (the "\\" itself has
# already been consumed) and returns its raw text: octal (\nnn), hex
# (\xNN), meta (\M-...), control (\C-..., \c..., \^...) — the last two
# recursing for nested escapes — or a single escaped character.
def read_escape
  res = ""
  case ch = getc
  when /[0-7]/
    # Up to three octal digits.
    ungetc ch
    3.times do
      case ch = getc
      when /[0-7]/
      when nil
        break
      else
        ungetc
        break
      end
      res << ch
    end

  when "x"
    # Up to two hex digits.
    res << ch
    2.times do
      case ch = getc
      when /[0-9a-fA-F]/
      when nil
        break
      else
        ungetc
        break
      end
      res << ch
    end

  when "M"
    # \M-x meta escape; the char after "-" may itself be escaped.
    res << ch
    if (ch = getc) != '-'
      ungetc
    else
      res << ch
      if (ch = getc) == "\\" #"
        res << ch
        res << read_escape
      else
        res << ch
      end
    end

  when "C", "c", "^"
    # Control escapes: \C-x requires the "-", \cx and \^x do not.
    res << ch
    if ch == "C" and (ch = getc) != "-"
      ungetc
    else
      res << ch
      if (ch = getc) == "\\" #"
        res << ch
        res << read_escape
      else
        res << ch
      end
    end
  else
    # Any other single character stands for itself.
    res << ch
  end
  res
end
# Consumes the body of a #{...} interpolation (the "#{" has already
# been read) through its balancing "}", tracking nested braces, and
# returns the consumed text including the final "}".
def skip_inner_expression
  captured = ""
  depth = 0
  while (c = getc)
    captured << c
    case c
    when "}"
      break if depth.zero?
      depth -= 1
    when "{"
      depth += 1
    end
  end
  captured
end
# Produces the next token via the SLex operator table. Records the
# token's start position, converts syntax errors into TkError (or
# aborts, per @exception_on_syntax_error), optionally swallows
# whitespace tokens, and optionally flushes the read buffer.
def token
  set_token_position(line_no, char_no)
  tk = nil
  loop do
    begin
      tk = @OP.match(self)
      @space_seen = tk.kind_of?(TkSPACE)
    rescue SyntaxError
      abort if @exception_on_syntax_error
      tk = TkError.new(line_no, char_no)
    end
    # Repeat only while skipping whitespace tokens.
    break unless @skip_space and tk.kind_of?(TkSPACE)
  end
  get_read if @read_auto_clean_up
  # throw :eof unless tk
  p tk if $DEBUG
  tk
end