Skip to content

Commit 5d570f8

Browse files
committed
[#22] Change parsing engine from regex to PEG (parboiled)
parboiled is a PEG (Parsing Expression Grammar) implementation. PEG grammars are more concise than regexes, and regexes cannot handle recursive structure well. GcEventNode is added to make the parsed data easy to access. Its concrete class is generated by the AutoValue library, which relies on annotation processing. IDE setup is required; instructions can be found here: google/auto#106
1 parent 008ccf7 commit 5d570f8

File tree

8 files changed

+375
-214
lines changed

8 files changed

+375
-214
lines changed

common/src/main/proto/gc_model.proto

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,4 @@ message GcEvent {
2121
double sys_time = 7; // time spent in OS call or waiting for system event
2222
double real_time = 8; // (user_time + sys_time) / threads# + alpha
2323
double ref_time = 9; // reference processing time
24-
}
24+
}

parser/.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
src/generated
2+
src/generated_tests

parser/build.gradle

+13-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,14 @@
1+
apply plugin: 'idea'
2+
13
dependencies {
2-
compile project(':common')
3-
compile 'org.apache.commons:commons-lang3:3.4'
4-
}
4+
compile project(':common')
5+
compile 'org.apache.commons:commons-lang3:3.4'
6+
compile 'org.parboiled:parboiled-java:1.1.7'
7+
compile 'com.google.auto.value:auto-value:1.2'
8+
}
9+
10+
idea {
11+
module {
12+
sourceDirs += file("${projectDir}/src/generated")
13+
}
14+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
package edu.kaist.algo.parser;
2+
3+
import org.apache.commons.lang3.StringUtils;
4+
import org.parboiled.BaseParser;
5+
import org.parboiled.Rule;
6+
import org.parboiled.annotations.BuildParseTree;
7+
import org.parboiled.annotations.Label;
8+
import org.parboiled.annotations.SuppressSubnodes;
9+
10+
/**
11+
* PEG (Parsing Expression Grammar) for the CMS GC log.
12+
*
13+
* <p>This PEG parses a single line of a CMS GC log. The line must be complete; that is,
14+
* it should not be cut off by another thread's interference.
15+
*
16+
* <p>Beware: CMS-related logs are not supported yet.
17+
*
18+
* <p>Following options are required:
19+
* <ul>
20+
* <li>-XX:+UseConcMarkSweepGC</li>
21+
* <li>-XX:+UnlockDiagnosticVMOptions</li>
22+
* <li>-XX:+LogVMOutput</li>
23+
* <li>-XX:+PrintGCDetails</li>
24+
* <li>-XX:+PrintGCTimeStamps</li>
25+
* </ul>
26+
*
27+
* <p>PEG (whitespaces are ignored for conciseness):
28+
* <pre>
29+
* InputLine <- Event UserSysRealTimes
30+
* Event <- (Time ': ')? '[' TypeAndDetail (Event)* UsageAndElapsedTime ']'
31+
* Time <- Digits '.' Digits ' secs'?
32+
* Digits <- [0-9]+
33+
* TypeAndDetail <- Type ('(' Detail ')')? ': '?
34+
* Type <- 'GC' / 'ParNew' / 'CMS' / 'Full GC' / 'Metaspace' / '1 CMS-initial-mark'
35+
* / 'YG occupancy' / 'Rescan (parallel)' / 'weak refs processing' / 'class unloading'
36+
* / 'scrub symbol table' / 'scrub string table' / '1 CMS-remark'
37+
* Detail <- 'System.gc()' / !')'+
38+
* UsageAndElapsedTime <- UsageChange? (', ' Event)? (', ' Time)?
39+
* UsageChange <- (Size '-&gt;')? UsageWithTotal
40+
* UsageWithTotal <- Size '(' Size ')'
41+
* Size <- Digits 'K '
42+
* UserSysRealTimes <- '[ Times: user=' Time ' sys=' Time ', real=' Time ']'
43+
* </pre>
44+
*/
45+
@BuildParseTree
46+
public class CmsGcLogRule extends BaseParser<Object> {
47+
48+
Rule InputLine() {
49+
return Sequence(
50+
push(GcEventNode.builder()),
51+
Event(),
52+
UserSysRealTimes(),
53+
push(popAsNode().build())
54+
);
55+
}
56+
57+
Rule Event() {
58+
return Sequence(
59+
Optional(
60+
TimeLong(), ": ",
61+
swap() && push(popAsNode().timestamp(popAsLong()))
62+
),
63+
"[", TypeAndDetail(), " ",
64+
ZeroOrMore(
65+
push(GcEventNode.builder()),
66+
Event(),
67+
swap() && push(popAsNode().addChild(popAsNode().build()))
68+
),
69+
" ", UsageAndElapsedTime(), "] "
70+
);
71+
}
72+
73+
Rule TypeAndDetail() {
74+
return Sequence(
75+
Type(),
76+
push(popAsNode().type(match())),
77+
Optional(" ", "(", Detail(), push(popAsNode().detail(match())), ")"),
78+
Optional(": ")
79+
);
80+
}
81+
82+
@SuppressSubnodes
83+
Rule Type() {
84+
return FirstOf("GC", "ParNew", "CMS", "Full GC", "Metaspace", "1 CMS-initial-mark",
85+
"YG occupancy", "Rescan (parallel)", "weak refs processing", "class unloading",
86+
"scrub symbol table", "scrub string table", "1 CMS-remark");
87+
}
88+
89+
@SuppressSubnodes
90+
Rule Detail() {
91+
return FirstOf("System.gc()", OneOrMore(NoneOf(")")));
92+
}
93+
94+
Rule UsageAndElapsedTime() {
95+
return Sequence(
96+
Optional(UsageChange()),
97+
Optional(", ",
98+
push(GcEventNode.builder()),
99+
Event(), // Metaspace
100+
swap() && push(popAsNode().addChild(popAsNode().build()))
101+
),
102+
Optional(", ",
103+
TimeDouble(),
104+
swap() && push(popAsNode().elapsedTime(popAsDouble()))
105+
)
106+
);
107+
}
108+
109+
Rule UsageChange() {
110+
return Sequence(
111+
Optional(
112+
Size(), "-&gt;",
113+
swap() && push(popAsNode().prevUsage(popAsLong()))
114+
),
115+
UsageWithTotal()
116+
);
117+
}
118+
119+
Rule UsageWithTotal() {
120+
return Sequence(
121+
Size(),
122+
"(", Size(), ")",
123+
swap3() && push(popAsNode().afterUsage(popAsLong())),
124+
push(popAsNode().capacity(popAsLong()))
125+
);
126+
}
127+
128+
Rule Size() {
129+
return Sequence(
130+
Digits(),
131+
push(Long.valueOf(match())),
132+
WhiteSpace(), "K "
133+
);
134+
}
135+
136+
Rule UserSysRealTimes() {
137+
return Sequence(
138+
"[", "Times: ", "user=", TimeDouble(), " sys=", TimeDouble(), ", real=", TimeDouble(), "]",
139+
swap4() && push(popAsNode().user(popAsDouble())),
140+
push(popAsNode().sys(popAsDouble())),
141+
push(popAsNode().real(popAsDouble()))
142+
);
143+
}
144+
145+
@Label("Time")
146+
@SuppressSubnodes
147+
Rule TimeDouble() {
148+
return Sequence(
149+
Sequence(Digits(), ".", Digits()),
150+
push(Double.valueOf(match())),
151+
Optional(" secs")
152+
);
153+
}
154+
155+
@Label("Time")
156+
@SuppressSubnodes
157+
Rule TimeLong() {
158+
return Sequence(
159+
Sequence(Digits(), ".", Digits()),
160+
push(Long.valueOf(StringUtils.remove(match(), ".")))
161+
);
162+
}
163+
164+
@SuppressSubnodes
165+
Rule Digits() {
166+
return OneOrMore(Digit());
167+
}
168+
169+
Rule Digit() {
170+
return CharRange('0', '9');
171+
}
172+
173+
@SuppressSubnodes
174+
Rule WhiteSpace() {
175+
return ZeroOrMore(AnyOf(" \t\f"));
176+
}
177+
178+
@Override
179+
protected Rule fromStringLiteral(String string) {
180+
return string.endsWith(" ")
181+
? Sequence(String(string.substring(0, string.length() - 1)), WhiteSpace())
182+
: String(string);
183+
}
184+
185+
protected Double popAsDouble() {
186+
return (Double) pop();
187+
}
188+
189+
protected GcEventNode.Builder popAsNode() {
190+
return (GcEventNode.Builder) pop();
191+
}
192+
193+
protected Long popAsLong() {
194+
return (Long) pop();
195+
}
196+
}

0 commit comments

Comments
 (0)