001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.filter;
019
020import java.nio.charset.Charset;
021import java.nio.charset.IllegalCharsetNameException;
022import java.util.Arrays;
023import java.util.regex.Pattern;
024import org.apache.hadoop.hbase.exceptions.DeserializationException;
025import org.apache.hadoop.hbase.util.Bytes;
026import org.apache.yetus.audience.InterfaceAudience;
027import org.jcodings.Encoding;
028import org.jcodings.EncodingDB;
029import org.jcodings.specific.NonStrictUTF8Encoding;
030import org.joni.Matcher;
031import org.joni.Option;
032import org.joni.Regex;
033import org.joni.Syntax;
034import org.slf4j.Logger;
035import org.slf4j.LoggerFactory;
036
037import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException;
038
039import org.apache.hadoop.hbase.shaded.protobuf.generated.ComparatorProtos;
040
041/**
042 * This comparator is for use with {@link CompareFilter} implementations, such as {@link RowFilter},
043 * {@link QualifierFilter}, and {@link ValueFilter}, for filtering based on the value of a given
044 * column. Use it to test if a given regular expression matches a cell value in the column.
045 * <p>
046 * Only EQUAL or NOT_EQUAL comparisons are valid with this comparator.
047 * <p>
048 * For example:
049 * <p>
050 *
051 * <pre>
052 * ValueFilter vf = new ValueFilter(CompareOp.EQUAL, new RegexStringComparator(
053 *   // v4 IP address
054 *   "(((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3,3}"
055 *     + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(\\/[0-9]+)?" + "|" +
056 *     // v6 IP address
057 *     "((([\\dA-Fa-f]{1,4}:){7}[\\dA-Fa-f]{1,4})(:([\\d]{1,3}.)"
058 *     + "{3}[\\d]{1,3})?)(\\/[0-9]+)?"));
059 * </pre>
060 * <p>
061 * Supports {@link java.util.regex.Pattern} flags as well:
062 * <p>
063 *
064 * <pre>
065 * ValueFilter vf = new ValueFilter(CompareOp.EQUAL,
066 *   new RegexStringComparator("regex", Pattern.CASE_INSENSITIVE | Pattern.DOTALL));
067 * </pre>
068 *
069 * @see java.util.regex.Pattern
070 */
071@InterfaceAudience.Public
072@SuppressWarnings("ComparableType") // Should this move to Comparator usage?
073public class RegexStringComparator extends ByteArrayComparable {
074
075  private static final Logger LOG = LoggerFactory.getLogger(RegexStringComparator.class);
076
077  private Engine engine;
078
079  /** Engine implementation type (default=JAVA) */
080  @InterfaceAudience.Public
081  public enum EngineType {
082    JAVA,
083    JONI
084  }
085
086  /**
087   * Constructor Adds Pattern.DOTALL to the underlying Pattern
088   * @param expr a valid regular expression
089   */
090  public RegexStringComparator(String expr) {
091    this(expr, Pattern.DOTALL);
092  }
093
094  /**
095   * Constructor Adds Pattern.DOTALL to the underlying Pattern
096   * @param expr   a valid regular expression
097   * @param engine engine implementation type
098   */
099  public RegexStringComparator(String expr, EngineType engine) {
100    this(expr, Pattern.DOTALL, engine);
101  }
102
103  /**
104   * Constructor
105   * @param expr  a valid regular expression
106   * @param flags java.util.regex.Pattern flags
107   */
108  public RegexStringComparator(String expr, int flags) {
109    this(expr, flags, EngineType.JAVA);
110  }
111
112  /**
113   * Constructor
114   * @param expr   a valid regular expression
115   * @param flags  java.util.regex.Pattern flags
116   * @param engine engine implementation type
117   */
118  public RegexStringComparator(String expr, int flags, EngineType engine) {
119    super(Bytes.toBytes(expr));
120    switch (engine) {
121      case JAVA:
122        this.engine = new JavaRegexEngine(expr, flags);
123        break;
124      case JONI:
125        this.engine = new JoniRegexEngine(expr, flags);
126        break;
127    }
128  }
129
130  /**
131   * Specifies the {@link Charset} to use to convert the row key to a String.
132   * <p>
133   * The row key needs to be converted to a String in order to be matched against the regular
134   * expression. This method controls which charset is used to do this conversion.
135   * <p>
136   * If the row key is made of arbitrary bytes, the charset {@code ISO-8859-1} is recommended.
137   * @param charset The charset to use.
138   */
139  public void setCharset(final Charset charset) {
140    engine.setCharset(charset.name());
141  }
142
143  @Override
144  public int compareTo(byte[] value, int offset, int length) {
145    return engine.compareTo(value, offset, length);
146  }
147
148  /** Returns The comparator serialized using pb */
149  @Override
150  public byte[] toByteArray() {
151    return engine.toByteArray();
152  }
153
154  /**
155   * @param pbBytes A pb serialized {@link RegexStringComparator} instance
156   * @return An instance of {@link RegexStringComparator} made from <code>bytes</code>
157   * @see #toByteArray
158   */
159  public static RegexStringComparator parseFrom(final byte[] pbBytes)
160    throws DeserializationException {
161    ComparatorProtos.RegexStringComparator proto;
162    try {
163      proto = ComparatorProtos.RegexStringComparator.parseFrom(pbBytes);
164    } catch (InvalidProtocolBufferException e) {
165      throw new DeserializationException(e);
166    }
167    RegexStringComparator comparator;
168    if (proto.hasEngine()) {
169      EngineType engine = EngineType.valueOf(proto.getEngine());
170      comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags(), engine);
171    } else {
172      comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags());
173    }
174    String charset = proto.getCharset();
175    if (charset.length() > 0) {
176      try {
177        comparator.getEngine().setCharset(charset);
178      } catch (IllegalCharsetNameException e) {
179        LOG.error("invalid charset", e);
180      }
181    }
182    return comparator;
183  }
184
185  /**
186   * @return true if and only if the fields of the comparator that are serialized are equal to the
187   *         corresponding fields in other. Used for testing.
188   */
189  @Override
190  boolean areSerializedFieldsEqual(ByteArrayComparable other) {
191    if (other == this) return true;
192    if (!(other instanceof RegexStringComparator)) return false;
193    RegexStringComparator comparator = (RegexStringComparator) other;
194    return super.areSerializedFieldsEqual(comparator)
195      && engine.getClass().isInstance(comparator.getEngine())
196      && engine.getPattern().equals(comparator.getEngine().getPattern())
197      && engine.getFlags() == comparator.getEngine().getFlags()
198      && engine.getCharset().equals(comparator.getEngine().getCharset());
199  }
200
201  Engine getEngine() {
202    return engine;
203  }
204
205  /**
206   * This is an internal interface for abstracting access to different regular expression matching
207   * engines.
208   */
209  static interface Engine {
210    /**
211     * Returns the string representation of the configured regular expression for matching
212     */
213    String getPattern();
214
215    /**
216     * Returns the set of configured match flags, a bit mask that may include {@link Pattern} flags
217     */
218    int getFlags();
219
220    /**
221     * Returns the name of the configured charset
222     */
223    String getCharset();
224
225    /**
226     * Set the charset used when matching
227     * @param charset the name of the desired charset for matching
228     */
229    void setCharset(final String charset);
230
231    /**
232     * Return the serialized form of the configured matcher
233     */
234    byte[] toByteArray();
235
236    /**
237     * Match the given input against the configured pattern
238     * @param value  the data to be matched
239     * @param offset offset of the data to be matched
240     * @param length length of the data to be matched
241     * @return 0 if a match was made, 1 otherwise
242     */
243    int compareTo(byte[] value, int offset, int length);
244  }
245
246  /**
247   * Implementation of the Engine interface using Java's Pattern.
248   * <p>
249   * This is the default engine.
250   */
251  static class JavaRegexEngine implements Engine {
252    private Charset charset = Charset.forName("UTF-8");
253    private Pattern pattern;
254
255    public JavaRegexEngine(String regex, int flags) {
256      this.pattern = Pattern.compile(regex, flags);
257    }
258
259    @Override
260    public String getPattern() {
261      return pattern.toString();
262    }
263
264    @Override
265    public int getFlags() {
266      return pattern.flags();
267    }
268
269    @Override
270    public String getCharset() {
271      return charset.name();
272    }
273
274    @Override
275    public void setCharset(String charset) {
276      this.charset = Charset.forName(charset);
277    }
278
279    @Override
280    public int compareTo(byte[] value, int offset, int length) {
281      // Use find() for subsequence match instead of matches() (full sequence
282      // match) to adhere to the principle of least surprise.
283      String tmp;
284      if (length < value.length / 2) {
285        // See HBASE-9428. Make a copy of the relevant part of the byte[],
286        // or the JDK will copy the entire byte[] during String decode
287        tmp = new String(Arrays.copyOfRange(value, offset, offset + length), charset);
288      } else {
289        tmp = new String(value, offset, length, charset);
290      }
291      return pattern.matcher(tmp).find() ? 0 : 1;
292    }
293
294    @Override
295    public byte[] toByteArray() {
296      ComparatorProtos.RegexStringComparator.Builder builder =
297        ComparatorProtos.RegexStringComparator.newBuilder();
298      builder.setPattern(pattern.pattern());
299      builder.setPatternFlags(pattern.flags());
300      builder.setCharset(charset.name());
301      builder.setEngine(EngineType.JAVA.name());
302      return builder.build().toByteArray();
303    }
304  }
305
306  /**
307   * Implementation of the Engine interface using Jruby's joni regex engine.
308   * <p>
309   * This engine operates on byte arrays directly so is expected to be more GC friendly, and
310   * reportedly is twice as fast as Java's Pattern engine.
311   * <p>
312   * NOTE: Only the {@link Pattern} flags CASE_INSENSITIVE, DOTALL, and MULTILINE are supported.
313   */
314  static class JoniRegexEngine implements Engine {
315    // When using UTF8Encoding, an infinite loop can occur if an invalid UTF8 is encountered.
316    // Use NonStrictUTF8Encoding instead of UTF8Encoding to avoid the issue.
317    private Encoding encoding = NonStrictUTF8Encoding.INSTANCE;
318    private String regex;
319    private Regex pattern;
320
321    public JoniRegexEngine(String regex, int flags) {
322      this.regex = regex;
323      byte[] b = Bytes.toBytes(regex);
324      this.pattern = new Regex(b, 0, b.length, patternToJoniFlags(flags), encoding, Syntax.Java);
325    }
326
327    @Override
328    public String getPattern() {
329      return regex;
330    }
331
332    @Override
333    public int getFlags() {
334      return pattern.getOptions();
335    }
336
337    @Override
338    public String getCharset() {
339      return encoding.getCharsetName();
340    }
341
342    @Override
343    public void setCharset(String name) {
344      setEncoding(name);
345    }
346
347    @Override
348    public int compareTo(byte[] value, int offset, int length) {
349      // Use subsequence match instead of full sequence match to adhere to the
350      // principle of least surprise.
351      Matcher m = pattern.matcher(value);
352      return m.search(offset, length, pattern.getOptions()) < 0 ? 1 : 0;
353    }
354
355    @Override
356    public byte[] toByteArray() {
357      ComparatorProtos.RegexStringComparator.Builder builder =
358        ComparatorProtos.RegexStringComparator.newBuilder();
359      builder.setPattern(regex);
360      builder.setPatternFlags(joniToPatternFlags(pattern.getOptions()));
361      builder.setCharset(encoding.getCharsetName());
362      builder.setEngine(EngineType.JONI.name());
363      return builder.build().toByteArray();
364    }
365
366    private int patternToJoniFlags(int flags) {
367      int newFlags = 0;
368      if ((flags & Pattern.CASE_INSENSITIVE) != 0) {
369        newFlags |= Option.IGNORECASE;
370      }
371      if ((flags & Pattern.DOTALL) != 0) {
372        // This does NOT mean Pattern.MULTILINE
373        newFlags |= Option.MULTILINE;
374      }
375      if ((flags & Pattern.MULTILINE) != 0) {
376        // This is what Java 8's Nashorn engine does when using joni and
377        // translating Pattern's MULTILINE flag
378        newFlags &= ~Option.SINGLELINE;
379        newFlags |= Option.NEGATE_SINGLELINE;
380      }
381      return newFlags;
382    }
383
384    private int joniToPatternFlags(int flags) {
385      int newFlags = 0;
386      if ((flags & Option.IGNORECASE) != 0) {
387        newFlags |= Pattern.CASE_INSENSITIVE;
388      }
389      // This does NOT mean Pattern.MULTILINE, this is equivalent to Pattern.DOTALL
390      if ((flags & Option.MULTILINE) != 0) {
391        newFlags |= Pattern.DOTALL;
392      }
393      // This means Pattern.MULTILINE. Nice
394      if ((flags & Option.NEGATE_SINGLELINE) != 0) {
395        newFlags |= Pattern.MULTILINE;
396      }
397      return newFlags;
398    }
399
400    private void setEncoding(String name) {
401      EncodingDB.Entry e = EncodingDB.getEncodings().get(Bytes.toBytes(name));
402      if (e != null) {
403        encoding = e.getEncoding();
404      } else {
405        throw new IllegalCharsetNameException(name);
406      }
407    }
408  }
409}