Skip to content

Commit d6d0366

Browse files
committed
feat: add config ingester_traffic_overflow_action
and improve traffic overflow
1 parent 9a0aaa2 commit d6d0366

8 files changed

Lines changed: 219 additions & 46 deletions

File tree

agent/crates/public/src/queue/overwrite_queue.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ struct OverwriteQueue<T: Sized> {
5555

5656
counter: Counter,
5757

58+
total_overwritten_count: AtomicU64,
59+
5860
_marker: PhantomData<T>,
5961
}
6062

@@ -78,6 +80,7 @@ impl<T> OverwriteQueue<T> {
7880
notify: Condvar::new(),
7981
terminated: AtomicBool::new(false),
8082
counter: Counter::default(),
83+
total_overwritten_count: AtomicU64::new(0),
8184
_marker: PhantomData,
8285
}
8386
}
@@ -130,6 +133,8 @@ impl<T> OverwriteQueue<T> {
130133
self.counter
131134
.overwritten
132135
.fetch_add(to_overwrite as u64, Ordering::Relaxed);
136+
self.total_overwritten_count
137+
.fetch_add(to_overwrite as u64, Ordering::Relaxed);
133138
}
134139
}
135140
let free_after_end = self.size - (raw_end & (self.size - 1));
@@ -416,6 +421,13 @@ impl<T> Receiver<T> {
416421
}
417422
}
418423
}
424+
425+
pub fn total_overwritten_count(&self) -> u64 {
426+
self.counter()
427+
.queue
428+
.total_overwritten_count
429+
.load(Ordering::Relaxed)
430+
}
419431
}
420432

421433
impl<T> Drop for Receiver<T> {

agent/src/config/config.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2121,6 +2121,28 @@ impl Default for Ntp {
21212121
}
21222122
}
21232123

2124+
/// Action applied when traffic sent to the ingester exceeds the configured
/// `max_throughput_to_ingester` limit.
///
/// The explicit discriminants are the integer values accepted for
/// `global.communication.ingester_traffic_overflow_action`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum TrafficOverflowAction {
    /// Configuration value `0` (the default): hold off sending until the
    /// limit allows it.
    #[default]
    Waiting = 0,
    /// Configuration value `1`: discard the data instead of waiting.
    Dropping = 1,
}
2131+
2132+
fn to_traffic_overflow_action<'de, D>(deserializer: D) -> Result<TrafficOverflowAction, D::Error>
2133+
where
2134+
D: Deserializer<'de>,
2135+
{
2136+
match u8::deserialize(deserializer)? {
2137+
0 => Ok(TrafficOverflowAction::Waiting),
2138+
1 => Ok(TrafficOverflowAction::Dropping),
2139+
other => Err(de::Error::invalid_value(
2140+
Unexpected::Unsigned(other as u64),
2141+
&"[0-1]",
2142+
)),
2143+
}
2144+
}
2145+
21242146
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
21252147
#[serde(default)]
21262148
pub struct Communication {
@@ -2133,6 +2155,8 @@ pub struct Communication {
21332155
#[serde(deserialize_with = "deser_usize_with_mega_unit")]
21342156
pub grpc_buffer_size: usize,
21352157
pub max_throughput_to_ingester: u64,
2158+
#[serde(deserialize_with = "to_traffic_overflow_action")]
2159+
pub ingester_traffic_overflow_action: TrafficOverflowAction,
21362160
pub request_via_nat_ip: bool,
21372161
pub proxy_controller_ip: String,
21382162
pub proxy_controller_port: u16,
@@ -2149,6 +2173,7 @@ impl Default for Communication {
21492173
ingester_port: 30033,
21502174
grpc_buffer_size: 5 << 20,
21512175
max_throughput_to_ingester: 100,
2176+
ingester_traffic_overflow_action: TrafficOverflowAction::Waiting,
21522177
request_via_nat_ip: false,
21532178
}
21542179
}

agent/src/config/handler.rs

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ use super::{
5151
HttpEndpointMatchRule, OracleConfig, PcapStream, PortConfig, ProcessorsFlowLogTunning,
5252
RequestLogTunning, SessionTimeout, TagFilterOperator, UserConfig,
5353
},
54-
ConfigError, KubernetesPollerType,
54+
ConfigError, KubernetesPollerType, TrafficOverflowAction,
5555
};
5656
use crate::flow_generator::protocol_logs::decode_new_rpc_trace_context_with_type;
5757
use crate::rpc::Session;
@@ -232,6 +232,7 @@ pub struct SenderConfig {
232232
pub npb_socket_type: agent::SocketType,
233233
pub multiple_sockets_to_ingester: bool,
234234
pub max_throughput_to_ingester: u64, // unit: Mbps
235+
pub ingester_traffic_overflow_action: TrafficOverflowAction,
235236
pub collector_socket_type: agent::SocketType,
236237
pub standalone_data_file_size: u32,
237238
pub standalone_data_file_dir: String,
@@ -1748,6 +1749,10 @@ impl TryFrom<(Config, UserConfig)> for ModuleConfig {
17481749
.throughput_monitoring_interval,
17491750
multiple_sockets_to_ingester: conf.outputs.socket.multiple_sockets_to_ingester,
17501751
max_throughput_to_ingester: conf.global.communication.max_throughput_to_ingester,
1752+
ingester_traffic_overflow_action: conf
1753+
.global
1754+
.communication
1755+
.ingester_traffic_overflow_action,
17511756
collector_socket_type: conf.outputs.socket.data_socket_type,
17521757
standalone_data_file_size: conf.global.standalone_mode.max_data_file_size,
17531758
standalone_data_file_dir: conf.global.standalone_mode.data_file_dir.clone(),
@@ -3875,6 +3880,17 @@ impl ConfigHandler {
38753880
);
38763881
communication.max_throughput_to_ingester = new_communication.max_throughput_to_ingester;
38773882
}
3883+
if communication.ingester_traffic_overflow_action
3884+
!= new_communication.ingester_traffic_overflow_action
3885+
{
3886+
info!(
3887+
"Update global.communication.ingester_traffic_overflow_action from {:?} to {:?}.",
3888+
communication.ingester_traffic_overflow_action,
3889+
new_communication.ingester_traffic_overflow_action
3890+
);
3891+
communication.ingester_traffic_overflow_action =
3892+
new_communication.ingester_traffic_overflow_action;
3893+
}
38783894
if communication.ingester_ip != new_communication.ingester_ip {
38793895
info!(
38803896
"Update global.communication.ingester_ip from {:?} to {:?}.",

agent/src/config/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ pub mod handler;
1919

2020
pub use config::{
2121
AgentIdType, Config, ConfigError, DpdkSource, KubernetesPollerType, OracleConfig, PcapStream,
22-
PrometheusExtraLabels, UserConfig, K8S_CA_CRT_PATH,
22+
PrometheusExtraLabels, TrafficOverflowAction, UserConfig, K8S_CA_CRT_PATH,
2323
};
2424
#[cfg(any(target_os = "linux", target_os = "android"))]
2525
pub use config::{ApiResources, ProcessMatcher};

agent/src/sender/uniform_sender.rs

Lines changed: 71 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,15 @@ use std::time::{Duration, Instant, SystemTime};
3030
use arc_swap::access::Access;
3131
use lazy_static::lazy_static;
3232
use log::{debug, error, info, warn};
33-
use public::sender::{SendMessageType, Sendable};
33+
use public::{
34+
leaky_bucket::LeakyBucket,
35+
sender::{SendMessageType, Sendable},
36+
};
3437
use rand::{thread_rng, RngCore};
3538

3639
use super::{get_sender_id, QUEUE_BATCH_SIZE};
3740

38-
use crate::config::handler::SenderAccess;
41+
use crate::config::{handler::SenderAccess, TrafficOverflowAction};
3942
use crate::exception::ExceptionHandler;
4043
use crate::trident::SenderEncoder;
4144
use crate::utils::stats::{
@@ -53,6 +56,7 @@ pub struct SenderCounter {
5356
pub tx: AtomicU64,
5457
pub tx_bytes: AtomicU64,
5558
pub dropped: AtomicU64,
59+
pub waited: AtomicU64,
5660
}
5761

5862
impl RefCountable for SenderCounter {
@@ -89,6 +93,11 @@ impl RefCountable for SenderCounter {
8993
CounterType::Counted,
9094
CounterValue::Unsigned(self.dropped.swap(0, Ordering::Relaxed)),
9195
),
96+
(
97+
"waited",
98+
CounterType::Counted,
99+
CounterValue::Unsigned(self.waited.swap(0, Ordering::Relaxed)),
100+
),
92101
]
93102
}
94103
}
@@ -153,10 +162,10 @@ impl<T: Sendable> Encoder<T> {
153162
version: HEADER_VESION,
154163
team_id: 0,
155164
organization_id: 0,
156-
agent_id: agent_id,
165+
agent_id,
157166
reserved_1: 0,
158167
reserved_2: 0,
159-
encoder: encoder,
168+
encoder,
160169
},
161170
_marker: PhantomData,
162171
}
@@ -340,6 +349,7 @@ lazy_static! {
340349
static ref TOTAL_SENT_BYTES: Arc<AtomicU64> = Arc::new(AtomicU64::new(0));
341350
static ref SENT_START_DURATION: Arc<AtomicU64> = Arc::new(AtomicU64::new(0));
342351
static ref LAST_LOGGING_DURATION: Arc<AtomicU64> = Arc::new(AtomicU64::new(0));
352+
static ref LEAKY_BUCKET: LeakyBucket = LeakyBucket::new(Some(0));
343353
}
344354

345355
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -380,6 +390,7 @@ pub struct UniformSender<T> {
380390

381391
input: Arc<Receiver<T>>,
382392
counter: Arc<SenderCounter>,
393+
overwritten_count: u64,
383394

384395
encoder: Encoder<T>,
385396
private_conn: Mutex<Connection>,
@@ -389,6 +400,8 @@ pub struct UniformSender<T> {
389400
multiple_sockets_to_ingester: bool,
390401
dest_ip: String,
391402
dest_port: u16,
403+
max_throughput_mbps: u64,
404+
ingester_traffic_overflow_action: TrafficOverflowAction,
392405

393406
config: SenderAccess,
394407

@@ -426,6 +439,7 @@ impl<T: Sendable> UniformSender<T> {
426439
name,
427440
input,
428441
counter: Arc::new(SenderCounter::default()),
442+
overwritten_count: 0,
429443
encoder: Encoder::new(
430444
0,
431445
SendMessageType::TaggedFlow,
@@ -440,6 +454,8 @@ impl<T: Sendable> UniformSender<T> {
440454
multiple_sockets_to_ingester: false,
441455
dest_ip: "127.0.0.1".to_string(),
442456
dest_port: cfg.dest_port,
457+
max_throughput_mbps: 0,
458+
ingester_traffic_overflow_action: TrafficOverflowAction::Waiting,
443459

444460
running,
445461
stats,
@@ -522,6 +538,11 @@ impl<T: Sendable> UniformSender<T> {
522538
}
523539

524540
fn send_buffer(&mut self) {
541+
if self.is_traffic_overflow()
542+
&& self.ingester_traffic_overflow_action == TrafficOverflowAction::Dropping
543+
{
544+
return;
545+
}
525546
let mut conn = match self.connection_type {
526547
ConnectionType::Global => self.global_shared_conn.lock().unwrap(),
527548
ConnectionType::PrivateShared => {
@@ -626,37 +647,51 @@ impl<T: Sendable> UniformSender<T> {
626647
}
627648
}
628649

629-
fn is_exceed_max_throughput(&mut self, max_throughput_mbps: u64) -> bool {
630-
if max_throughput_mbps == 0 {
631-
return false;
650+
fn log_when_traffic_overflow(&mut self) {
651+
let now = SystemTime::now()
652+
.duration_since(SystemTime::UNIX_EPOCH)
653+
.unwrap();
654+
// to prevent frequent log printing, print at least once every 10 seconds
655+
if now - Duration::from_nanos(LAST_LOGGING_DURATION.load(Ordering::Relaxed))
656+
> Duration::from_secs(10)
657+
{
658+
warn!(
659+
"{} sender dropping message, throughput exceed setting value 'max_throughput_to_ingester' {}Mbps, action {:?}, total overwrittern count {}",
660+
self.name, self.max_throughput_mbps, self.ingester_traffic_overflow_action, self.overwritten_count
661+
);
662+
LAST_LOGGING_DURATION.store(now.as_nanos() as u64, Ordering::Relaxed);
632663
}
633-
let max_throughput_bytes = max_throughput_mbps << 20 >> 3;
634-
if TOTAL_SENT_BYTES.load(Ordering::Relaxed) > max_throughput_bytes {
635-
let now = SystemTime::now()
636-
.duration_since(SystemTime::UNIX_EPOCH)
637-
.unwrap();
664+
}
638665

639-
let used = now - Duration::from_nanos(SENT_START_DURATION.load(Ordering::Relaxed));
640-
if used > Duration::from_secs(1) {
641-
SENT_START_DURATION.store(now.as_nanos() as u64, Ordering::Relaxed);
642-
TOTAL_SENT_BYTES.store(0, Ordering::Relaxed);
643-
} else {
644-
// to prevent frequent log printing, print at least once every 5 seconds
645-
if now - Duration::from_nanos(LAST_LOGGING_DURATION.load(Ordering::Relaxed))
646-
> Duration::from_secs(5)
647-
{
648-
warn!(
649-
"{} sender dropping message, throughput execeed setting value 'max_throughput_to_ingester' {}Mbps",
650-
self.name, max_throughput_mbps
651-
);
652-
LAST_LOGGING_DURATION.store(now.as_nanos() as u64, Ordering::Relaxed);
666+
fn is_traffic_overflow(&mut self) -> bool {
667+
if self.max_throughput_mbps == 0 {
668+
return false;
669+
}
670+
let mut overflow = false;
671+
if self.ingester_traffic_overflow_action == TrafficOverflowAction::Waiting {
672+
while !LEAKY_BUCKET.acquire(self.encoder.buffer_len() as u64) {
673+
// LEAKY_BUCKET token is updated every 100ms by default,
674+
// wait 10ms each time until the token is acquired
675+
thread::sleep(Duration::from_millis(10));
676+
if self.input.total_overwritten_count() > self.overwritten_count {
677+
overflow = true;
678+
self.overwritten_count = self.input.total_overwritten_count();
653679
}
654-
self.exception_handler
655-
.set(Exception::DataBpsThresholdExceeded);
656-
return true;
680+
self.counter.waited.fetch_add(1, Ordering::Relaxed);
657681
}
682+
} else {
683+
if !LEAKY_BUCKET.acquire(self.encoder.buffer_len() as u64) {
684+
overflow = true;
685+
self.counter.dropped.fetch_add(1, Ordering::Relaxed);
686+
}
687+
}
688+
689+
if overflow {
690+
self.exception_handler
691+
.set(Exception::DataBpsThresholdExceeded);
692+
self.log_when_traffic_overflow();
658693
}
659-
return false;
694+
overflow
660695
}
661696

662697
fn check_or_register_counterable(&mut self, message_type: SendMessageType) {
@@ -677,7 +712,12 @@ impl<T: Sendable> UniformSender<T> {
677712
while self.running.load(Ordering::Relaxed) {
678713
let config = self.config.load();
679714
let socket_type = config.collector_socket_type;
680-
let max_throughput_mpbs = config.max_throughput_to_ingester;
715+
let max_throughput_mbps = config.max_throughput_to_ingester;
716+
if self.max_throughput_mbps != max_throughput_mbps {
717+
LEAKY_BUCKET.set_rate(Some(max_throughput_mbps << 17)); // Mbit -> byte
718+
self.max_throughput_mbps = max_throughput_mbps;
719+
}
720+
self.ingester_traffic_overflow_action = config.ingester_traffic_overflow_action;
681721
match self.input.recv_all(
682722
&mut batch,
683723
Some(Duration::from_secs(Self::QUEUE_READ_TIMEOUT)),
@@ -688,13 +728,6 @@ impl<T: Sendable> UniformSender<T> {
688728
start_cached = Instant::now();
689729
self.cached = false;
690730
}
691-
if self.is_exceed_max_throughput(max_throughput_mpbs) {
692-
self.counter
693-
.dropped
694-
.fetch_add(batch.len() as u64, Ordering::Relaxed);
695-
batch.clear();
696-
continue;
697-
}
698731
for send_item in batch.drain(..) {
699732
if !self.running.load(Ordering::Relaxed) {
700733
break;

server/agent_config/README-CH.md

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1058,9 +1058,43 @@ global:
10581058
**详细描述**:
10591059

10601060
向 Server 端 Ingester 模块发送可观测性数据的最大允许流量,
1061-
超过此限速时数据将会主动丢弃、且采集器会标记为异常状态并触发告警
1061+
超限行为参考 `ingester_traffic_overflow_action` 配置描述。
10621062
配置为 0 表示不限速。
10631063

1064+
### Ingester 流量超限的动作 {#global.communication.ingester_traffic_overflow_action}
1065+
1066+
**标签**:
1067+
1068+
`hot_update`
1069+
1070+
**FQCN**:
1071+
1072+
`global.communication.ingester_traffic_overflow_action`
1073+
1074+
**默认值**:
1075+
```yaml
1076+
global:
1077+
communication:
1078+
ingester_traffic_overflow_action: 0
1079+
```
1080+
1081+
**枚举可选值**:
1082+
| Value | Note |
1083+
| ----- | ---------------------------- |
1084+
| 0 | WAIT |
1085+
| 1 | DROP |
1086+
1087+
**模式**:
1088+
| Key | Value |
1089+
| ---- | ---------------------------- |
1090+
| Type | int |
1091+
1092+
**详细描述**:
1093+
1094+
Ingester 流量超限的动作
1095+
- WAIT:暂停发送,数据缓存到队列,等待下次发送。
1096+
- DROP:直接丢弃数据,并触发 Agent `数据流量达到限速`异常。
1097+
10641098
### 请求 NAT IP 地址 {#global.communication.request_via_nat_ip}
10651099

10661100
**标签**:

0 commit comments

Comments
 (0)