elk集群 docker-compose集群运行版

2023-12-14 17:29:46

elk集群 docker-compose集群运行版

机器分配

192.168.77.136 docker-compose

192.168.77.137 log-test cron

安装docker、docker-compose

centos

yum -y install docker-ce docker-compose

debian/ubuntu

apt -y install docker-ce docker-compose

编写docker-compose.yaml

# Working directory for the stack: docker-compose.yaml, logstash.yml and the
# pipeline/ directory are all created under /opt/elk.
mkdir /opt/elk
cd /opt/elk
# Create logstash.yml with the two settings listed right below this command.
vim logstash.yml 

# Bind Logstash's HTTP API to all interfaces.
http.host: "0.0.0.0"
# Elasticsearch endpoint used for Logstash X-Pack monitoring data; this is the
# docker-compose host address with port 9200 (published by elasticsearch1).
xpack.monitoring.elasticsearch.hosts: [ "http://192.168.77.136:9200" ]

vim docker-compose.yaml
# Single-host ELK stack: three Elasticsearch 7.14.0 nodes in one cluster
# ("es-docker-cluster"), one Logstash and one Kibana, all on the 7.14.0 image tag.
version: '3'
services:
  # Node 1: the only Elasticsearch node whose HTTP port (9200) is published on the host.
  elasticsearch1:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.14.0
    container_name: elasticsearch1
    environment:
      - node.name=elasticsearch1
      - cluster.name=es-docker-cluster
      # Peers to contact for discovery (the other two services, by service name).
      - discovery.seed_hosts=elasticsearch2,elasticsearch3
      # Nodes used to bootstrap the first master election.
      # NOTE(review): only two nodes are master-eligible in this file, so losing
      # either one probably leaves the cluster without a master - confirm this
      # is acceptable for the intended use.
      - cluster.initial_master_nodes=elasticsearch1,elasticsearch2
      # Memory locking is enabled here and the memlock ulimit below is unlimited (-1).
      - bootstrap.memory_lock=true
      # Fixed 512 MB JVM heap (min == max).
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    volumes:
      - data01:/usr/share/elasticsearch/data
    ports:
      - 9200:9200

  # Node 2: same settings as node 1, but no port is published on the host.
  elasticsearch2:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.14.0
    container_name: elasticsearch2
    environment:
      - node.name=elasticsearch2
      - cluster.name=es-docker-cluster
      - discovery.seed_hosts=elasticsearch1,elasticsearch3
      - cluster.initial_master_nodes=elasticsearch1,elasticsearch2
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    volumes:
      - data02:/usr/share/elasticsearch/data

  # Node 3: not master-eligible (node.master=false); no port is published on the host.
  elasticsearch3:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.14.0
    container_name: elasticsearch3
    environment:
      - node.name=elasticsearch3
      - cluster.name=es-docker-cluster
      - discovery.seed_hosts=elasticsearch1,elasticsearch2
      # NOTE(review): this node sets node.master=false, so listing
      # cluster.initial_master_nodes here looks unnecessary - confirm against
      # the Elasticsearch bootstrapping documentation.
      - cluster.initial_master_nodes=elasticsearch1,elasticsearch2
      - node.master=false
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    volumes:
      - data03:/usr/share/elasticsearch/data

  # Logstash: reads host log files through the /var/log bind mount and loads its
  # settings and pipeline definitions from /opt/elk on the host.
  logstash:
    image: docker.elastic.co/logstash/logstash:7.14.0
    volumes:
      # Host logs appear inside the container under /host/var/log.
      - /var/log:/host/var/log
      - /opt/elk/logstash.yml:/usr/share/logstash/config/logstash.yml  # replaces the settings file at this path in the container
      - /opt/elk/pipeline:/usr/share/logstash/pipeline
    ports:
      # NOTE(review): port 5000 is published, but no input in this file listens
      # on it - confirm whether it is needed.
      - 5000:5000

  # Kibana: UI published on host port 5601, connected to elasticsearch1 by service name.
  kibana:
    image: docker.elastic.co/kibana/kibana:7.14.0
    ports:
      - 5601:5601
    environment:
      - ELASTICSEARCH_HOSTS=http://elasticsearch1:9200

# Named volumes (local driver), one per Elasticsearch node's data directory.
volumes:
  data01:
    driver: local
  data02:
    driver: local  
  data03:
    driver: local
## Grant access to the logs being collected: the log directory gets the execute
## bit, and everything under it becomes readable by "other" users.
## NOTE(review): this makes every file under /var/log world-readable - confirm
## that is acceptable on this host.
chmod +x /var/log
chmod -R o+r /var/log
## Create the directory for the Logstash pipeline config files; it is the host
## path that docker-compose.yaml mounts at /usr/share/logstash/pipeline.
mkdir /opt/elk/pipeline
cd /opt/elk/pipeline


[root@localhost pipeline]# cat logstash-cron.conf 
# Pipeline fragment: read the host's cron logs and index them into Elasticsearch.
#
# Logstash concatenates every config file in the pipeline directory into ONE
# pipeline, so each filter/output below is guarded by [type]; an unguarded
# output would also fire for events produced by the other *.conf files.
input {
  file {
    path => "/host/var/log/cron*"  # host /var/log is mounted at /host/var/log; adjust to the actual cron log path
    start_position => "beginning"
    sincedb_path => "/dev/null"  # read positions are not persisted
    type => "cron"  # adds a "type" field to each event; used below to route it
  }
}

filter {
  if [type] == "cron" {
    grok {
      # Copies the whole line into cron_message (no real parsing is done).
      match => { "message" => "%{GREEDYDATA:cron_message}" }
      add_field => { "source_ip" => "192.168.77.136" }  # change to the actual IP of the server the logs come from
    }
  }
}

output {
  if [type] == "cron" {
    elasticsearch {
      hosts => ["192.168.77.136:9200"]
      # One index per source server per day.
      index => "cron_%{source_ip}-%{+YYYY.MM.dd}"
    }
    # Debug echo, kept inside the [type] guard so that events belonging to
    # other config files in the same directory are not printed again here.
    stdout { codec => rubydebug }
  }
}

[root@localhost pipeline]# cat logstash-nginx.conf 
# Pipeline fragment: read the host's nginx logs and index them into Elasticsearch.
#
# Logstash concatenates every config file in the pipeline directory into ONE
# pipeline, so each filter/output below is guarded by [type]; an unguarded
# output would also fire for events produced by the other *.conf files.
input {
  file {
    path => "/host/var/log/nginx/*"  # host /var/log is mounted at /host/var/log; adjust to the actual nginx log path
    start_position => "beginning"
    sincedb_path => "/dev/null"  # read positions are not persisted
    type => "nginx"  # adds a "type" field to each event; used below to route it
  }
}

filter {
  if [type] == "nginx" {
    grok {
      # Copies the whole line into nginx_message (no real parsing is done).
      # The field was previously named cron_message, a leftover from the cron config.
      match => { "message" => "%{GREEDYDATA:nginx_message}" }
      add_field => { "source_ip" => "192.168.77.136" }  # change to the actual IP of the server the logs come from
    }
  }
}

output {
  if [type] == "nginx" {
    elasticsearch {
      hosts => ["192.168.77.136:9200"]
      # One index per source server per day.
      index => "nginx_%{source_ip}-%{+YYYY.MM.dd}"
    }
    # Debug echo, kept inside the [type] guard so that events belonging to
    # other config files in the same directory are not printed again here.
    stdout { codec => rubydebug }
  }
}

注意权限问题:对日志文件夹要有执行权限,对日志文件要有读取权限,并关闭 SELinux。

启停

cd /opt/elk
# Start: bring the whole stack up in the background; the command's standard
# output is written to elk.log in the current directory.
docker-compose up -d > elk.log
# Stop: tear the stack down. Run this only when you want to shut the stack
# down, not immediately after the start command above.
docker-compose down

其他日志收集服务器

## On the log-collecting server: grant access to the logs being collected
## (execute permission on the log directory, read permission on the log files).
chmod +x /var/log
chmod -R o+r /var/log
## On the main cluster host: copy the config directory to the log-collecting
## server. -r is required because /opt/elk is a directory; without it scp
## refuses to copy it.
scp -r /opt/elk 192.168.77.137:/opt
## On the log-collecting server: start a standalone Logstash container with the
## same mounts and image as the logstash service in docker-compose.yaml.
docker run --name elk_logstash_1 -d -p 5000:5000 \
  -v /var/log:/host/var/log \
  -v /opt/elk/pipeline:/usr/share/logstash/pipeline \
  -v /opt/elk/logstash.yml:/usr/share/logstash/config/logstash.yml \
  docker.elastic.co/logstash/logstash:7.14.0

排错

如果遇到以下问题
[root@localhost elk]# curl http://192.168.77.136:9200/_cluster/health?pretty
{
"cluster_name" : "es-docker-cluster",
"status" : "red",
"timed_out" : false,
"number_of_nodes" : 3,
"number_of_data_nodes" : 3,
"active_primary_shards" : 8,
"active_shards" : 8,
"relocating_shards" : 0,
"initializing_shards" : 0,
"unassigned_shards" : 18,
"delayed_unassigned_shards" : 0,
"number_of_pending_tasks" : 2,
"number_of_in_flight_fetch" : 3,
"task_max_waiting_in_queue_millis" : 1013,
"active_shards_percent_as_number" : 30.76923076923077
}
## 集群健康状态为 red:
"status" : "red",
## 使用以下命令查看分片未分配的原因:
curl -X GET "192.168.77.136:9200/_cluster/allocation/explain?pretty"
## 执行结果如下:
curl -X GET "192.168.77.136:9200/_cluster/allocation/explain?pretty"
{
"index" : ".geoip_databases",
"shard" : 0,
"primary" : false,
"current_state" : "unassigned",
"unassigned_info" : {
"reason" : "CLUSTER_RECOVERED",
"at" : "2023-12-10T10:43:38.178Z",
"last_allocation_status" : "no_attempt"
},
"can_allocate" : "throttled",
"allocate_explanation" : "allocation temporarily throttled",
"node_allocation_decisions" : [
{
"node_id" : "p0gVV46mSiaNR2eIxc8KRQ",
"node_name" : "elasticsearch2",
"transport_address" : "172.19.0.3:9300",
"node_attributes" : {
"ml.machine_memory" : "4072427520",
"ml.max_open_jobs" : "512",
"xpack.installed" : "true",
"ml.max_jvm_size" : "536870912",
"transform.node" : "true"
},
"node_decision" : "throttled",
"deciders" : [
{
"decider" : "throttling",
"decision" : "THROTTLE",
"explanation" : "reached the limit of outgoing shard recoveries [2] on the node [a1aLJNBxQ7G0kJw8SfI8xQ] which holds the primary, cluster setting [cluster.routing.allocation.node_concurrent_outgoing_recoveries=2] (can also be set via [cluster.routing.allocation.node_concurrent_recoveries])"
}
]
},
{
"node_id" : "yRljcHJOS121zoG97QpEMA",
"node_name" : "elasticsearch3",
"transport_address" : "172.19.0.5:9300",
"node_attributes" : {
"ml.machine_memory" : "4072427520",
"ml.max_open_jobs" : "512",
"xpack.installed" : "true",
"ml.max_jvm_size" : "536870912",
"transform.node" : "true"
},
"node_decision" : "throttled",
"deciders" : [
{
"decider" : "throttling",
"decision" : "THROTTLE",
"explanation" : "reached the limit of outgoing shard recoveries [2] on the node [a1aLJNBxQ7G0kJw8SfI8xQ] which holds the primary, cluster setting [cluster.routing.allocation.node_concurrent_outgoing_recoveries=2] (can also be set via [cluster.routing.allocation.node_concurrent_recoveries])"
}
]
},
{
"node_id" : "a1aLJNBxQ7G0kJw8SfI8xQ",
"node_name" : "elasticsearch1",
"transport_address" : "172.19.0.6:9300",
"node_attributes" : {
"ml.machine_memory" : "4072427520",
"xpack.installed" : "true",
"transform.node" : "true",
"ml.max_open_jobs" : "512",
"ml.max_jvm_size" : "536870912"
},
"node_decision" : "no",
"store" : {
"matching_size_in_bytes" : 40959603
},
"deciders" : [
{
"decider" : "same_shard",
"decision" : "NO",
"explanation" : "a copy of this shard is already allocated to this node [[.geoip_databases][0], node[a1aLJNBxQ7G0kJw8SfI8xQ], [P], s[STARTED], a[id=xg4XhDseQuuoRdFE9BQL_A]]"
},
{
"decider" : "throttling",
"decision" : "THROTTLE",
"explanation" : "reached the limit of outgoing shard recoveries [2] on the node [a1aLJNBxQ7G0kJw8SfI8xQ] which holds the primary, cluster setting [cluster.routing.allocation.node_concurrent_outgoing_recoveries=2] (can also be set via [cluster.routing.allocation.node_concurrent_recoveries])"
}
]
}
]
}
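这段 Elasticsearch 集群分片分配信息显示,.geoip_databases 索引的副本分片(shard 0)没有被分配("current_state" : "unassigned")。其中 elasticsearch2 和 elasticsearch3 被暂时限流("node_decision" : "throttled"),而 elasticsearch1 已经持有该分片的主分片,因此不能再把副本分配给它("node_decision" : "no",decider 为 same_shard)。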

这主要是由于 cluster.routing.allocation.node_concurrent_outgoing_recoveries 集群设置的限制。这个设置限制了单个节点可以同时进行的迁出分片恢复操作的数量,其默认值是 2。在你的集群中,持有主分片的节点(elasticsearch1,节点 ID 为 a1aLJNBxQ7G0kJw8SfI8xQ)已经达到了这个限制,因此其他节点上的副本分片暂时无法开始恢复。

对于这个问题,你有两个可能的解决方法:

临时增大 cluster.routing.allocation.node_concurrent_outgoing_recoveries 的值。可以按“未分配分片数 ÷ 节点数”来估算:例如,你的集群有 18 个未分配的分片和 3 个可用节点,18 ÷ 3 = 6,因此可以将 cluster.routing.allocation.node_concurrent_outgoing_recoveries 的值设为 6。请注意,增大这个值会让更多的恢复操作同时进行,可能会增大集群的负载。
用以下的 API 指令可以增大这个值:

# Set cluster.routing.allocation.node_concurrent_outgoing_recoveries to 6 as a
# "transient" cluster setting, via the cluster settings API on
# 192.168.77.136:9200.
curl -X PUT "192.168.77.136:9200/_cluster/settings" -H 'Content-Type: application/json' -d'
{
  "transient" :{
      "cluster.routing.allocation.node_concurrent_outgoing_recoveries" : 6
   }
}'
等待当前的恢复操作完成。一旦当前的恢复操作完成,新的分片就可以被分配了。在实际生产环境中,最好的做法是避免产生大量未分配的分片。你可以更频繁地检查你的集群健康状态,一旦问题发生,立即进行处理。

文章来源:https://blog.csdn.net/qq_42704442/article/details/134994712
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。