Reimplemented the accumulation for the head token using a StringBuilder instead of a JRuby array, added more tests to cover some 'buffer full' conditions

andsel · andsel · commit 09ad4ffc1f91 · 2024-10-01T10:53:27.000+02:00
diff --git a/logstash-core/src/main/java/org/logstash/common/BufferedTokenizerExt.java b/logstash-core/src/main/java/org/logstash/common/BufferedTokenizerExt.java
@@ -20,11 +20,7 @@
 
 package org.logstash.common;
 
-import org.jruby.Ruby;
-import org.jruby.RubyArray;
-import org.jruby.RubyClass;
-import org.jruby.RubyObject;
-import org.jruby.RubyString;
+import org.jruby.*;
 import org.jruby.anno.JRubyClass;
 import org.jruby.anno.JRubyMethod;
 import org.jruby.runtime.ThreadContext;
@@ -40,10 +36,12 @@ public class BufferedTokenizerExt extends RubyObject {
                                                                 freeze(RubyUtil.RUBY.getCurrentContext());
 
     private @SuppressWarnings("rawtypes") RubyArray input = RubyUtil.RUBY.newArray();
+    private StringBuilder headToken = new StringBuilder();
     private RubyString delimiter = NEW_LINE;
     private int sizeLimit;
     private boolean hasSizeLimit;
     private int inputSize;
+    private boolean bufferFullErrorNotified = false;
 
     public BufferedTokenizerExt(final Ruby runtime, final RubyClass metaClass) {
         super(runtime, metaClass);
@@ -66,7 +64,6 @@ public IRubyObject init(final ThreadContext context, IRubyObject[] args) {
      * Extract takes an arbitrary string of input data and returns an array of
      * tokenized entities, provided there were any available to extract.  This
      * makes for easy processing of datagrams using a pattern like:
-     *
      * {@code tokenizer.extract(data).map { |entity| Decode(entity) }.each do}
      *
      * @param context ThreadContext
@@ -77,22 +74,53 @@ public IRubyObject init(final ThreadContext context, IRubyObject[] args) {
     @SuppressWarnings("rawtypes")
     public RubyArray extract(final ThreadContext context, IRubyObject data) {
         final RubyArray entities = data.convertToString().split(delimiter, -1);
+        if (!bufferFullErrorNotified) {
+            input.clear();
+            input.addAll(entities);
+        } else {
+            // after a full buffer signal
+            if (input.isEmpty()) {
+                // after a buffer full error, the remaining part of the line, till next delimiter,
+                // has to be consumed, unless the input buffer doesn't still contain fragments of
+                // subsequent tokens.
+                entities.shift(context);
+                input.addAll(entities);
+            } else {
+                // merge last of the input with first of incoming data segment
+                if (!entities.isEmpty()) {
+                    RubyString last = ((RubyString) input.pop(context));
+                    RubyString nextFirst = ((RubyString) entities.shift(context));
+                    entities.unshift(last.concat(nextFirst));
+                    input.addAll(entities);
+                }
+            }
+        }
+
         if (hasSizeLimit) {
-            final int entitiesSize = ((RubyString) entities.first()).size();
+            if (bufferFullErrorNotified) {
+                bufferFullErrorNotified = false;
+                if (input.isEmpty()) {
+                    return RubyUtil.RUBY.newArray();
+                }
+            }
+            final int entitiesSize = ((RubyString) input.first()).size();
             if (inputSize + entitiesSize > sizeLimit) {
+                bufferFullErrorNotified = true;
+                headToken = new StringBuilder();
+                inputSize = 0;
+                input.shift(context); // consume the token fragment that generates the buffer full
                 throw new IllegalStateException("input buffer full");
             }
             this.inputSize = inputSize + entitiesSize;
         }
-        input.append(entities.shift(context));
-        if (entities.isEmpty()) {
+        headToken.append(input.shift(context)); // remove head
+        if (input.isEmpty()) {
             return RubyUtil.RUBY.newArray();
         }
-        entities.unshift(input.join(context));
-        input.clear();
-        input.append(entities.pop(context));
-        inputSize = ((RubyString) input.first()).size();
-        return entities;
+        input.unshift(RubyUtil.toRubyObject(headToken.toString())); // insert new head
+        headToken = new StringBuilder(input.pop(context).toJava(String.class)); // remove the tail of the data segment
+        inputSize = headToken.length();
+        return input;
     }
 
     /**
@@ -104,14 +132,14 @@ public RubyArray extract(final ThreadContext context, IRubyObject data) {
      */
     @JRubyMethod
     public IRubyObject flush(final ThreadContext context) {
-        final IRubyObject buffer = input.join(context);
-        input.clear();
+        final IRubyObject buffer = RubyUtil.toRubyObject(headToken.toString());
+        headToken = new StringBuilder();
         return buffer;
     }
 
     @JRubyMethod(name = "empty?")
     public IRubyObject isEmpty(final ThreadContext context) {
-        return input.empty_p();
+        return RubyBoolean.newBoolean(context.runtime, headToken.toString().isEmpty());
     }
 
 }
diff --git a/logstash-core/src/test/java/org/logstash/common/BufferedTokenizerExtTest.java b/logstash-core/src/test/java/org/logstash/common/BufferedTokenizerExtTest.java
@@ -83,7 +83,7 @@ public void shouldIgnoreEmptyPayload() {
     @Test
     public void shouldTokenizeEmptyPayloadWithNewline() {
         RubyArray<RubyString> tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("\n"));
-        assertEquals(List.of(""), tokens);;
+        assertEquals(List.of(""), tokens);
 
         tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("\n\n\n"));
         assertEquals(List.of("", "", ""), tokens);
diff --git a/logstash-core/src/test/java/org/logstash/common/BufferedTokenizerExtWithSizeLimitTest.java b/logstash-core/src/test/java/org/logstash/common/BufferedTokenizerExtWithSizeLimitTest.java
@@ -31,8 +31,7 @@
 
 import static org.hamcrest.MatcherAssert.assertThat;
 import static org.hamcrest.Matchers.containsString;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertThrows;
+import static org.junit.Assert.*;
 import static org.logstash.RubyUtil.RUBY;
 
 @SuppressWarnings("unchecked")
@@ -72,7 +71,7 @@ public void givenExtractedThrownLimitErrorWhenFeedFreshDataThenReturnTokenStarti
         assertThat(thrownException.getMessage(), containsString("input buffer full"));
 
         RubyArray<RubyString> tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("\nanother"));
-        assertEquals("After buffer full error should resume form the end of line", List.of("kaboom", "another"), tokens);
+        assertEquals("After buffer full error should resume from the end of line", List.of("kaboom"), tokens);
     }
 
     @Test
@@ -84,7 +83,28 @@ public void givenExtractInvokedWithDifferentFramingAfterBufferFullErrorTWhenFeed
         });
         assertThat(thrownException.getMessage(), containsString("input buffer full"));
 
-        RubyArray<RubyString> tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("aa\nbbbb"));
+        RubyArray<RubyString> tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("aa\nbbbb\nccc"));
         assertEquals(List.of("bbbb"), tokens);
     }
+
+    @Test
+    public void giveMultipleSegmentsThatGeneratesMultipleBufferFullErrorsThenIsAbleToRecoverTokenization() {
+        sut.extract(context, RubyUtil.RUBY.newString("aaaa"));
+
+        //first buffer full on 13 "a" letters
+        Exception thrownException = assertThrows(IllegalStateException.class, () -> {
+            sut.extract(context, RubyUtil.RUBY.newString("aaaaaaa"));
+        });
+        assertThat(thrownException.getMessage(), containsString("input buffer full"));
+
+        // second buffer full on 11 "b" letters
+        Exception secondThrownException = assertThrows(IllegalStateException.class, () -> {
+            sut.extract(context, RubyUtil.RUBY.newString("aa\nbbbbbbbbbbb\ncc"));
+        });
+        assertThat(secondThrownException.getMessage(), containsString("input buffer full"));
+
+        // now should resemble processing on c and d
+        RubyArray<RubyString> tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("ccc\nddd\n"));
+        assertEquals(List.of("ccccc", "ddd"), tokens);
+    }
 }