Skip to content

Sometimes gracefulReloads take a very long time #3345

@alex-vmw

Description

@alex-vmw

Describe the bug
We were performing tests that were issuing multiple /api/config.gracefulReload calls to fluentd. We have noticed that sometimes it took a very long time for gracefulReload to go through. For example, in this example we have about 14 mins between receiving a SIGUSR2 and worker=0 finally reloading:

2021-04-16 05:27:51 +0000 [debug]: #0 fluent/log.rb:308:debug: worker got SIGUSR2
...
2021-04-16 05:42:00 +0000 [info]: #0 fluent/log.rb:329:info: restart fluentd worker worker=0

To Reproduce
Reload fluentd via /api/config.gracefulReload call many times and watch the reload time. The issue might also be related to tailing lots of kubernetes container logs as this is what fluentd is doing on our k8s nodes.

Expected behavior
gracefulReload should take place very quickly.

Your Environment

  • Fluentd or td-agent version: fluentd 1.12.2
  • Operating system: Ubuntu 20.04.1 LTS
  • Kernel version: 5.4.0-42-generic

If you hit the problem with older fluentd version, try latest version first.

Your Configuration

<ROOT>
  <system>
    log_level info
    rpc_endpoint "127.0.0.1:24444"
  </system>
  <source>
    @type systemd
    @id in_systemd_docker
    path "/var/log/journal"
    tag "systemd.unit"
    read_from_head false
    <storage>
      @type "local"
      persistent true
      path "/var/log/-kube-fluentd-operator--fluentd-journald-cursor.json"
    </storage>
    <entry>
      field_map {"_SYSTEMD_UNIT":"unit","MESSAGE":"log","_PID":["pid"],"_PRIORITY":"priority","_COMM":"cmd","_HOSTNAME":"hostname"}
      field_map_strict true
      fields_lowercase true
    </entry>
  </source>
  <match systemd.unit>
    @type rewrite_tag_filter
    <rule>
      key "unit"
      pattern ^(.+)$
      tag "systemd.$1"
    </rule>
  </match>
  <filter systemd.kubelet.service>
    @type parser
    reserve_data true
    key_name "log"
    emit_invalid_record_to_error false
    <parse>
      @type "kubernetes"
      time_format "%m%d %H:%M:%S.%N"
      expression /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/m
    </parse>
  </filter>
  <source>
    @type tail
    @id in_tail_container_logs
    path "/var/log/containers/*.log"
    pos_file "/var/log/-kube-fluentd-operator--fluentd-containers.log.pos"
    tag "kubernetes.*"
    read_from_head true
    <parse>
      @type "multiline"
      format1 /^(?<partials>([^\n]+ (stdout|stderr) P [^\n]+\n)*)/
      format2 /(?<time>[^\n]+) (?<stream>stdout|stderr) F (?<log>[^\n]*)/
      format3 /|(?<json>{.*})/
      time_format "%Y-%m-%dT%H:%M:%S.%N%:z"
      unmatched_lines
    </parse>
  </source>
  <filter kubernetes.**>
    @type record_transformer
    @id filter_crio_container_logs
    enable_ruby true
    remove_keys partials
    <record>
      log ${record["partials"]&.gsub(/.+ (stdout|stderr) P (.+)\n/, '\\2')}${record["log"]}
    </record>
  </filter>
  <filter kubernetes.**>
    @type parser
    @id filter_docker_container_logs
    key_name "json"
    remove_key_name_field true
    reserve_data true
    emit_invalid_record_to_error false
    <parse>
      @type "json"
      time_format "%Y-%m-%dT%H:%M:%S.%NZ"
      time_type string
    </parse>
  </filter>
  <source>
    @type tail
    @id in_tail_startupscript
    path "/var/log/startupscript.log"
    pos_file "/var/log/-kube-fluentd-operator--fluentd-startupscript.log.pos"
    tag "startupscript"
    <parse>
      @type "syslog"
      unmatched_lines
    </parse>
  </source>
  <source>
    @type tail
    @id in_tail_docker
    path "/var/log/docker.log"
    pos_file "/var/log/-kube-fluentd-operator--fluentd-docker.log.pos"
    tag "docker"
    <parse>
      @type "regexp"
      expression /^time="(?<time>[^)]*)" level=(?<severity>[^ ]*) msg="(?<message>[^"]*)"( err="(?<error>[^"]*)")?( statusCode=($<status_code>\d+))?/
      unmatched_lines
    </parse>
  </source>
  <source>
    @type tail
    @id in_tail_etcd
    path "/var/log/etcd.log"
    pos_file "/var/log/-kube-fluentd-operator--fluentd-etcd.log.pos"
    tag "k8s.etcd"
    <parse>
      @type "none"
      unmatched_lines
    </parse>
  </source>
  <source>
    @type tail
    @id in_tail_kubelet
    multiline_flush_interval 5s
    path "/var/log/kubelet.log"
    pos_file "/var/log/-kube-fluentd-operator--fluentd-kubelet.log.pos"
    tag "k8s.kubelet"
    <parse>
      @type "kubernetes"
      unmatched_lines
      expression /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/m
      time_format "%m%d %H:%M:%S.%N"
    </parse>
  </source>
  <source>
    @type tail
    @id in_tail_kube_proxy
    multiline_flush_interval 5s
    path "/var/log/kube-proxy.log"
    pos_file "/var/log/-kube-fluentd-operator--fluentd-kube-proxy.log.pos"
    tag "k8s.kube-proxy"
    <parse>
      @type "kubernetes"
      unmatched_lines
      expression /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/m
      time_format "%m%d %H:%M:%S.%N"
    </parse>
  </source>
  <source>
    @type tail
    @id in_tail_kube_apiserver
    multiline_flush_interval 5s
    path "/var/log/kube-apiserver.log"
    pos_file "/var/log/-kube-fluentd-operator--fluentd-kube-apiserver.log.pos"
    tag "k8s.kube-apiserver"
    <parse>
      @type "kubernetes"
      unmatched_lines
      expression /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/m
      time_format "%m%d %H:%M:%S.%N"
    </parse>
  </source>
  <source>
    @type tail
    @id in_tail_kube_controller_manager
    multiline_flush_interval 5s
    path "/var/log/kube-controller-manager.log"
    pos_file "/var/log/-kube-fluentd-operator--fluentd-kube-controller-manager.log.pos"
    tag "k8s.kube-controller-manager"
    <parse>
      @type "kubernetes"
      unmatched_lines
      expression /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/m
      time_format "%m%d %H:%M:%S.%N"
    </parse>
  </source>
  <source>
    @type tail
    @id in_tail_kube_scheduler
    multiline_flush_interval 5s
    path "/var/log/kube-scheduler.log"
    pos_file "/var/log/-kube-fluentd-operator--fluentd-kube-scheduler.log.pos"
    tag "k8s.kube-scheduler"
    <parse>
      @type "kubernetes"
      unmatched_lines
      expression /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/m
      time_format "%m%d %H:%M:%S.%N"
    </parse>
  </source>
  <source>
    @type tail
    @id in_tail_rescheduler
    multiline_flush_interval 5s
    path "/var/log/rescheduler.log"
    pos_file "/var/log/-kube-fluentd-operator--fluentd-rescheduler.log.pos"
    tag "k8s.rescheduler"
    <parse>
      @type "kubernetes"
      unmatched_lines
      expression /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/m
      time_format "%m%d %H:%M:%S.%N"
    </parse>
  </source>
  <source>
    @type tail
    @id in_tail_glbc
    multiline_flush_interval 5s
    path "/var/log/glbc.log"
    pos_file "/var/log/-kube-fluentd-operator--fluentd-glbc.log.pos"
    tag "k8s.glbc"
    <parse>
      @type "kubernetes"
      unmatched_lines
      expression /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/m
      time_format "%m%d %H:%M:%S.%N"
    </parse>
  </source>
  <source>
    @type tail
    @id in_tail_cluster_autoscaler
    multiline_flush_interval 5s
    path "/var/log/cluster-autoscaler.log"
    pos_file "/var/log/-kube-fluentd-operator--fluentd-cluster-autoscaler.log.pos"
    tag "k8s.cluster-autoscaler"
    <parse>
      @type "kubernetes"
      unmatched_lines
      expression /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/m
      time_format "%m%d %H:%M:%S.%N"
    </parse>
  </source>
  <source>
    @type tail
    @id in_tail_kube_apiserver_audit
    multiline_flush_interval 5s
    path "/var/log/kubernetes/kube-apiserver-audit.log"
    pos_file "/var/log/-kube-fluentd-operator--kube-apiserver-audit.log.pos"
    tag "k8s.kube-apiserver-audit"
    <parse>
      @type "multiline"
      format_firstline "/^\\S+\\s+AUDIT:/"
      format1 /^(?<time>\S+) AUDIT:(?: (?:id="(?<id>(?:[^"\\]|\\.)*)"|ip="(?<ip>(?:[^"\\]|\\.)*)"|method="(?<method>(?:[^"\\]|\\.)*)"|user="(?<user>(?:[^"\\]|\\.)*)"|groups="(?<groups>(?:[^"\\]|\\.)*)"|as="(?<as>(?:[^"\\]|\\.)*)"|asgroups="(?<asgroups>(?:[^"\\]|\\.)*)"|namespace="(?<namespace>(?:[^"\\]|\\.)*)"|uri="(?<uri>(?:[^"\\]|\\.)*)"|response="(?<response>(?:[^"\\]|\\.)*)"|\w+="(?:[^"\\]|\\.)*"))*/
      time_format "%Y-%m-%dT%T.%L%Z"
      unmatched_lines
    </parse>
  </source>
  <filter kubernetes.**>
    @type kubernetes_metadata
    @id filter_kube_metadata
  </filter>
  <filter kubernetes.**>
    @type kubernetes_metadata
  </filter>
  <filter kubernetes.**>
    @type record_transformer
    enable_ruby true
    <record>
      kubernetes_namespace_container_name ${record["kubernetes"]["namespace_name"]}.${record["kubernetes"]["pod_name"]}.${record["kubernetes"]["container_name"]}
      container_info ${record["docker"]["container_id"]}-${record["stream"]}
    </record>
  </filter>
  <match kubernetes.**>
    @type rewrite_tag_filter
    <rule>
      key "kubernetes_namespace_container_name"
      pattern ^(.+)$
      tag "kube.$1"
    </rule>
  </match>
  <filter kube.*.*.*>
    @type record_modifier
    remove_keys "dummy_"
    <record>
      dummy_ ${record["kubernetes"]&.delete("master_url"); record["kubernetes"]&.delete("namespace_id"); if record["kubernetes"]&.has_key?("labels"); record["kubernetes"]["labels"].delete("pod-template-generation"); record["kubernetes"]["labels"].delete("controller-revision-hash");  record["kubernetes"]["labels"].delete("pod-template-hash"); end; nil}
    </record>
  </filter>
  <filter kube.*.*.*>
    @type record_transformer
    remove_keys kubernetes_namespace_container_name
  </filter>
  <filter kube.kube-system.**>
    @type parser
    reserve_data true
    key_name "log"
    emit_invalid_record_to_error false
    <parse>
      @type "kubernetes"
      time_format "%m%d %H:%M:%S.%N"
      expression /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/m
    </parse>
  </filter>
  <source>
    @type prometheus
  </source>
  <source>
    @type prometheus_monitor
  </source>
  <source>
    @type prometheus_output_monitor
  </source>
  <match systemd.** kube.kube-system.**>
    @type copy
    <store>
      @id X
      @type "vmware_loginsight"
      host "X.X.X.X"
      include_tag_key true
      log_text_keys ["log","msg","message"]
      port 9000
      scheme "http"
      serializer "json"
      ssl_verify false
      tag_key "tag"
    </store>
  </match>
  <match **>
    @type null
  </match>
</ROOT>

Your Error Log
fluentd-very-long-reload.log

Additional context

Metadata

Metadata

Assignees

No one assigned

    Labels

    bugSomething isn't working

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions