Skip to content

Commit

Permalink
Add a node containment option to semgrex that works on CoreAnnotation…
Browse files Browse the repository at this point in the history
…s which are Maps.

Currently the syntax is @, subject to change

eg,

morphofeatures@foo=bar

In this expression, bar can be a regex, but foo and morphofeatures cannot.
It might be worth adding regex capabilities for both of those.
Also, !@ would be a useful addition.

This checks at Semgrex compile time (not Java compile time) that
the annotation used for key/value is actually a Map

Includes a test verifying that both the error checking and a simple search work.
  • Loading branch information
AngledLuffa committed Feb 27, 2025
1 parent dcee001 commit 2fce986
Show file tree
Hide file tree
Showing 5 changed files with 219 additions and 58 deletions.
16 changes: 16 additions & 0 deletions src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,20 @@ public class NodeAttributes {
// String, String, Boolean: key, value, negated
private List<Triple<String, String, Boolean>> attributes;
private Set<String> positiveAttributes;
// Some annotations, especially morpho features (CoreAnnotations.CoNLLUFeats)
// are represented by Maps. In some cases it will be easier to search
// for individual elements of that map rather than turn the map into a string
// and search on its contents that way. This is especially true since there
// is no guarantee the map will be in a consistent order.
// String, String, String: node attribute for a map (such as CoNLLUFeats), key in that map, value to match
private List<Triple<String, String, String>> contains;

/**
 * Creates a blank set of node attributes: neither root nor empty,
 * with no attributes and no map containment constraints recorded yet.
 */
public NodeAttributes() {
  // collections first, all start out with no entries
  this.contains = new ArrayList<>();
  this.positiveAttributes = new HashSet<>();
  this.attributes = new ArrayList<>();

  // flags default to off
  this.empty = false;
  this.root = false;
}

public void setRoot(boolean root) {
Expand Down Expand Up @@ -60,7 +68,15 @@ public void setAttribute(String key, String value, boolean negated) {
attributes.add(new Triple(key, value, negated));
}

/**
 * Records a containment constraint on a Map-valued node annotation.
 *
 * @param annotation name of the node attribute holding the map (such as CoNLLUFeats)
 * @param key the key to look up in that map
 * @param value the value to match for that key
 */
public void addContains(String annotation, String key, String value) {
  // use the diamond rather than a raw Triple so the element type is
  // checked against the declared List<Triple<String, String, String>>
  contains.add(new Triple<>(annotation, key, value));
}

/**
 * Gives read-only access to the recorded attributes.
 *
 * @return an unmodifiable view of the (key, value, negated) triples
 */
public List<Triple<String, String, Boolean>> attributes() {
  final List<Triple<String, String, Boolean>> readOnlyView =
      Collections.unmodifiableList(this.attributes);
  return readOnlyView;
}

/**
 * Gives read-only access to the recorded map containment constraints.
 *
 * @return an unmodifiable view of the (annotation, key, value) triples
 */
public List<Triple<String, String, String>> contains() {
  final List<Triple<String, String, String>> readOnlyView =
      Collections.unmodifiableList(this.contains);
  return readOnlyView;
}
}
101 changes: 84 additions & 17 deletions src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.AnnotationLookup;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
Expand All @@ -32,6 +33,11 @@ public class NodePattern extends SemgrexPattern {
* Otherwise, the type will be a Pattern, and you must use Pattern.matches().
*/
private final List<Attribute> attributes;
/**
* Attributes which represent Maps (eg CoNLLUFeats)
* and only partial matches are necessary
*/
private final List<Pair<String, Attribute>> partialAttributes;
private final boolean isRoot;
private final boolean isLink;
private final boolean isEmpty;
Expand All @@ -58,6 +64,9 @@ public NodePattern(GraphRelation r, boolean negDesc,
// order the attributes so that the pattern stays the same when
// printing a compiled pattern
this.attributes = new ArrayList<>();
// same with partial attributes
this.partialAttributes = new ArrayList<>();

descString = "{";
for (Triple<String, String, Boolean> entry : attrs.attributes()) {
if (!descString.equals("{"))
Expand All @@ -70,23 +79,7 @@ public NodePattern(GraphRelation r, boolean negDesc,
if (value.equals("__")) {
attributes.add(new Attribute(key, true, true, negated));
} else if (value.matches("/.*/")) {
boolean isRegexp = false;
for (int i = 1; i < value.length() - 1; ++i) {
char chr = value.charAt(i);
if ( !( (chr >= 'A' && chr <= 'Z') || (chr >= 'a' && chr <= 'z') || (chr >= '0' && chr <= '9') ) ) {
isRegexp = true;
break;
}
}
String patternContent = value.substring(1, value.length() - 1);
if (isRegexp) {
attributes.add(new Attribute(key,
Pattern.compile(patternContent),
Pattern.compile(patternContent, Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE),
negated));
} else {
attributes.add(new Attribute(key, patternContent, patternContent, negated));
}
attributes.add(buildRegexAttribute(key, value, negated));
} else { // raw description
attributes.add(new Attribute(key, value, value, negated));
}
Expand All @@ -98,6 +91,33 @@ public NodePattern(GraphRelation r, boolean negDesc,
}
}

for (Triple<String, String, String> entry : attrs.contains()) {
String annotation = entry.first();
String key = entry.second();
String value = entry.third();

Class<?> clazz = AnnotationLookup.getValueType(AnnotationLookup.toCoreKey(annotation));
boolean isMap = clazz != null && Map.class.isAssignableFrom(clazz);
if (!isMap) {
throw new SemgrexParseException("Cannot process a single key/value from annotation " + annotation + " as it is not a Map");
}

final Attribute attr;
// Add the attributes for this key
if (value.equals("__")) {
attr = new Attribute(key, true, true, false);
} else if (value.matches("/.*/")) {
attr = buildRegexAttribute(key, value, false);
} else { // raw description
attr = new Attribute(key, value, value, false);
}
partialAttributes.add(new Pair<>(annotation, attr));

if (!descString.equals("{"))
descString += ";";
descString += (annotation + "@" + key + "=" + value);
}

if (attrs.root()) {
if (!descString.equals("{"))
descString += ";";
Expand All @@ -118,6 +138,30 @@ public NodePattern(GraphRelation r, boolean negDesc,
this.variableGroups = Collections.unmodifiableList(variableGroups);
}

/**
 * Builds the Attribute for a value written between slashes, eg /foo/.
 * <br>
 * If everything between the slashes is an ASCII letter or digit, the
 * text is kept as a plain string for both the case sensitive and
 * case insensitive matchers.  Otherwise it is compiled into a
 * case sensitive Pattern and a case insensitive Pattern.
 *
 * @param key the attribute key the resulting Attribute is built with
 * @param value the slash-delimited expression, including both slashes
 * @param negated whether the resulting Attribute is negated
 * @return an Attribute which matches this expression
 */
private Attribute buildRegexAttribute(String key, String value, boolean negated) {
  // drop the leading and trailing slash
  final String body = value.substring(1, value.length() - 1);

  // any character outside [A-Za-z0-9] means this has to be treated as a regex
  boolean plainText = true;
  for (char c : body.toCharArray()) {
    final boolean upper = (c >= 'A' && c <= 'Z');
    final boolean lower = (c >= 'a' && c <= 'z');
    final boolean digit = (c >= '0' && c <= '9');
    if (!upper && !lower && !digit) {
      plainText = false;
      break;
    }
  }

  if (plainText) {
    return new Attribute(key, body, body, negated);
  }

  final Pattern caseSensitive = Pattern.compile(body);
  final Pattern caseInsensitive = Pattern.compile(body, Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
  return new Attribute(key, caseSensitive, caseInsensitive, negated);
}

private boolean checkMatch(Attribute attr, boolean ignoreCase, String nodeValue) {
if (nodeValue == null) {
// treat non-existent attributes as having matched a negated expression
Expand Down Expand Up @@ -189,6 +233,29 @@ public boolean nodeAttrMatch(IndexedWord node, final SemanticGraph sg, boolean i
return negDesc;
}
}
for (Pair<String, Attribute> partialAttribute : partialAttributes) {
String annotation = partialAttribute.first();
Attribute attr = partialAttribute.second();

Class clazz = Env.lookupAnnotationKey(env, annotation);
Object rawmap = node.get(clazz);
// if the map is null, it can't possibly match...
if (rawmap == null) {
return negDesc;
}
if (!(rawmap instanceof Map))
throw new RuntimeException("Can only use partial attributes with Maps... this should have been checked at creation time!");
Map<String, ?> map = (Map) rawmap;

// TODO: allow for regex match on the keys?
Object value = map.get(attr.key);
final String nodeValue = (value == null) ? null : value.toString();
boolean matches = checkMatch(attr, ignoreCase, nodeValue);
if (!matches) {
return negDesc;
}
}

// System.out.println("matches");
// System.out.println("");
return !negDesc;
Expand Down
Loading

0 comments on commit 2fce986

Please sign in to comment.