-
Notifications
You must be signed in to change notification settings - Fork 3.7k
/
Copy pathdata_stream.rs
1790 lines (1624 loc) · 72 KB
/
data_stream.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright © Aptos Foundation
// Parts of the project are originally copyright © Meta Platforms, Inc.
// SPDX-License-Identifier: Apache-2.0
use crate::{
data_notification,
data_notification::{
DataClientRequest, DataNotification, DataPayload, EpochEndingLedgerInfosRequest,
NewTransactionOutputsWithProofRequest, NewTransactionsOrOutputsWithProofRequest,
NewTransactionsWithProofRequest, NotificationId, NumberOfStatesRequest,
StateValuesWithProofRequest, SubscribeTransactionOutputsWithProofRequest,
SubscribeTransactionsOrOutputsWithProofRequest, SubscribeTransactionsWithProofRequest,
TransactionOutputsWithProofRequest, TransactionsOrOutputsWithProofRequest,
TransactionsWithProofRequest,
},
dynamic_prefetching::DynamicPrefetchingState,
error::Error,
logging::{LogEntry, LogEvent, LogSchema},
metrics,
metrics::{increment_counter, increment_counter_multiple_labels, start_timer},
stream_engine::{DataStreamEngine, StreamEngine},
streaming_client::{NotificationFeedback, StreamRequest},
streaming_service::StreamUpdateNotification,
};
use aptos_channels::aptos_channel;
use aptos_config::config::{AptosDataClientConfig, DataStreamingServiceConfig};
use aptos_data_client::{
global_summary::{AdvertisedData, GlobalDataSummary},
interface::{
AptosDataClientInterface, Response, ResponseContext, ResponseError, ResponsePayload,
SubscriptionRequestMetadata,
},
};
use aptos_id_generator::{IdGenerator, U64IdGenerator};
use aptos_infallible::Mutex;
use aptos_logger::prelude::*;
use aptos_time_service::{TimeService, TimeServiceTrait};
use futures::{channel::mpsc, stream::FusedStream, SinkExt, Stream};
use std::{
cmp::min,
collections::{BTreeMap, VecDeque},
pin::Pin,
sync::Arc,
task::{Context, Poll},
time::{Duration, Instant},
};
use tokio::task::JoinHandle;
/// The frequency (in seconds) at which to sample-log sent data request messages
const SENT_REQUESTS_LOG_FREQ_SECS: u64 = 1;
/// A unique ID used to identify each stream.
pub type DataStreamId = u64;
/// A pointer to a thread-safe `PendingClientResponse`.
/// The inner response is fulfilled asynchronously by a spawned request task.
pub type PendingClientResponse = Arc<Mutex<Box<data_notification::PendingClientResponse>>>;
/// Each data stream holds the original stream request from the client and tracks
/// the progress of the data stream to satisfy that request (e.g., the data that
/// has already been sent along the stream to the client and the in-flight Aptos
/// data client requests that have been sent to the network).
///
/// Note that it is the responsibility of the data stream to send data
/// notifications along the stream in sequential order (e.g., transactions and
/// proofs must be sent with monotonically increasing versions).
///
/// The generic parameter `T` is the Aptos data client implementation used to
/// fetch data from the network (see the trait bounds on the `impl` block).
#[derive(Debug)]
pub struct DataStream<T> {
    // The configuration for the data client
    data_client_config: AptosDataClientConfig,
    // The configuration for the streaming service
    streaming_service_config: DataStreamingServiceConfig,
    // The unique ID for this data stream. This is useful for logging.
    data_stream_id: DataStreamId,
    // The data client through which to fetch data from the Aptos network
    aptos_data_client: T,
    // The engine for this data stream
    stream_engine: StreamEngine,
    // The stream update notifier (to notify the streaming service that
    // the stream has been updated, e.g., data is now ready to be processed).
    stream_update_notifier: aptos_channel::Sender<(), StreamUpdateNotification>,
    // The current queue of data client requests and pending responses. When the
    // request at the head of the queue completes (i.e., we receive a response),
    // a data notification can be created and sent along the stream.
    // `None` until `initialize_data_requests` is called.
    sent_data_requests: Option<VecDeque<PendingClientResponse>>,
    // Handles of all spawned tasks. This is useful for aborting the tasks in
    // the case the stream is terminated prematurely.
    spawned_tasks: Vec<JoinHandle<()>>,
    // Maps a notification ID (sent along the data stream) to a response context.
    notifications_to_responses: BTreeMap<NotificationId, ResponseContext>,
    // The channel on which to send data notifications when they are ready.
    notification_sender: mpsc::Sender<DataNotification>,
    // A unique notification ID generator
    notification_id_generator: Arc<U64IdGenerator>,
    // Notification ID of the end of stream notification (when it has been sent)
    stream_end_notification_id: Option<NotificationId>,
    // The current failure count of the request at the head of the request queue.
    // If this count becomes too large, the stream is evidently blocked (i.e.,
    // unable to make progress) and will automatically terminate.
    request_failure_count: u64,
    // Whether the data stream has encountered an error trying to send a
    // notification to the listener. If so, the stream is dead and it will
    // stop sending notifications. This handles when clients drop the listener.
    send_failure: bool,
    // The measured subscription stream lag (if any)
    subscription_stream_lag: Option<SubscriptionStreamLag>,
    // The time service to track elapsed time (e.g., during stream lag checks)
    time_service: TimeService,
    // The dynamic prefetching state (if enabled)
    dynamic_prefetching_state: DynamicPrefetchingState,
}
impl<T: AptosDataClientInterface + Send + Clone + 'static> DataStream<T> {
/// Creates a new data stream for the given stream request, returning the
/// stream alongside the listener that the client uses to receive data
/// notifications. Fails if a stream engine cannot be constructed from the
/// request and the currently advertised data.
pub fn new(
    data_client_config: AptosDataClientConfig,
    data_stream_config: DataStreamingServiceConfig,
    data_stream_id: DataStreamId,
    stream_request: &StreamRequest,
    stream_update_notifier: aptos_channel::Sender<(), StreamUpdateNotification>,
    aptos_data_client: T,
    notification_id_generator: Arc<U64IdGenerator>,
    advertised_data: &AdvertisedData,
    time_service: TimeService,
) -> Result<(Self, DataStreamListener), Error> {
    // Build the stream engine that drives this stream's progress
    let stream_engine = StreamEngine::new(data_stream_config, stream_request, advertised_data)?;

    // Set up the notification channel and the listener handed back to the client
    let channel_size = data_stream_config.max_data_stream_channel_sizes as usize;
    let (notification_sender, notification_receiver) = mpsc::channel(channel_size);
    let data_stream_listener = DataStreamListener::new(data_stream_id, notification_receiver);

    // Initialize the dynamic prefetching state
    let dynamic_prefetching_state =
        DynamicPrefetchingState::new(data_stream_config, time_service.clone());

    // Assemble the data stream itself (requests are not yet initialized)
    let data_stream = Self {
        data_client_config,
        streaming_service_config: data_stream_config,
        data_stream_id,
        aptos_data_client,
        stream_engine,
        stream_update_notifier,
        sent_data_requests: None,
        spawned_tasks: vec![],
        notifications_to_responses: BTreeMap::new(),
        notification_sender,
        notification_id_generator,
        stream_end_notification_id: None,
        request_failure_count: 0,
        send_failure: false,
        subscription_stream_lag: None,
        time_service,
        dynamic_prefetching_state,
    };
    Ok((data_stream, data_stream_listener))
}
/// Clears the sent data requests queue and aborts all spawned tasks
pub fn clear_sent_data_requests_queue(&mut self) {
    // Drop any pending data requests that are still queued
    if let Some(requests) = &mut self.sent_data_requests {
        requests.clear();
    }

    // Terminate all in-flight request tasks
    self.abort_spawned_tasks();
}
/// Returns true iff the first batch of data client requests has been sent
/// (i.e., the requests queue has been initialized).
pub fn data_requests_initialized(&self) -> bool {
    matches!(self.sent_data_requests, Some(_))
}
/// Resets the subscription stream lag on the data stream
fn reset_subscription_stream_lag(&mut self) {
    // Zero out the lag metric before clearing the tracked state
    metrics::set_subscription_stream_lag(0);
    self.subscription_stream_lag = None;
}
/// Sets the subscription stream lag on the data stream
fn set_subscription_stream_lag(&mut self, subscription_stream_lag: SubscriptionStreamLag) {
    // Publish the new lag to the metrics before storing it
    metrics::set_subscription_stream_lag(subscription_stream_lag.version_lag);
    self.subscription_stream_lag = Some(subscription_stream_lag);
}
/// Initializes the data client requests by sending out the first batch
pub fn initialize_data_requests(
    &mut self,
    global_data_summary: GlobalDataSummary,
) -> Result<(), Error> {
    // An empty queue marks the stream as initialized
    self.sent_data_requests = Some(VecDeque::new());

    // Fire off the first batch of data client requests
    self.create_and_send_client_requests(&global_data_summary)
}
/// Returns true iff the given `notification_id` was sent by this stream
/// (either as a regular data notification or as the end of stream marker).
pub fn sent_notification(&self, notification_id: &NotificationId) -> bool {
    // Check if this was the end of stream notification
    if self.stream_end_notification_id == Some(*notification_id) {
        return true;
    }

    // Otherwise, check the sent notification map.
    // Note: `contains_key` replaces the original `.get(..).is_some()`
    // (the idiomatic form, flagged by clippy).
    self.notifications_to_responses.contains_key(notification_id)
}
/// Notifies the Aptos data client of a bad client response
///
/// If the feedback targets the end of stream notification, only
/// `NotificationFeedback::EndOfStream` is accepted (any other feedback is an
/// error). Otherwise, the feedback is translated into a `ResponseError` and
/// forwarded to the data client via the saved response context.
pub fn handle_notification_feedback(
    &self,
    notification_id: &NotificationId,
    notification_feedback: &NotificationFeedback,
) -> Result<(), Error> {
    // Special case: feedback for the end of stream notification
    if self.stream_end_notification_id == Some(*notification_id) {
        return if matches!(notification_feedback, NotificationFeedback::EndOfStream) {
            Ok(())
        } else {
            Err(Error::UnexpectedErrorEncountered(format!(
                "Invalid feedback given for stream end: {:?}",
                notification_feedback
            )))
        };
    }

    // Look up the response context for the notification (it must exist)
    let response_context = self
        .notifications_to_responses
        .get(notification_id)
        .ok_or_else(|| {
            Error::UnexpectedErrorEncountered(format!(
                "Response context missing for notification ID: {:?}",
                notification_id
            ))
        })?;

    // Translate the feedback into a response error and notify the data client
    let response_error = extract_response_error(notification_feedback)?;
    self.notify_bad_response(response_context, response_error);
    Ok(())
}
/// Creates and sends a batch of aptos data client requests to the network
///
/// The number of requests sent is bounded by both the configured maximum
/// number of pending requests and the (dynamically prefetched) maximum
/// number of concurrent in-flight requests.
fn create_and_send_client_requests(
    &mut self,
    global_data_summary: &GlobalDataSummary,
) -> Result<(), Error> {
    // Calculate the number of in-flight requests (i.e., requests that haven't completed)
    let num_pending_requests = self.get_num_pending_data_requests()?;
    let num_complete_pending_requests = self.get_num_complete_pending_requests()?;
    let num_in_flight_requests =
        num_pending_requests.saturating_sub(num_complete_pending_requests);

    // Calculate the max number of requests that can be sent now
    let max_pending_requests = self.streaming_service_config.max_pending_requests;
    let max_num_requests_to_send = if num_pending_requests >= max_pending_requests {
        0 // We're already at the max number of pending requests (don't do anything)
    } else {
        // Otherwise, calculate the max number of requests to send based on
        // the max concurrent requests and the number of pending request slots.
        let remaining_concurrent_requests = self
            .dynamic_prefetching_state
            .get_max_concurrent_requests(&self.stream_engine)
            .saturating_sub(num_in_flight_requests);
        let remaining_request_slots = max_pending_requests.saturating_sub(num_pending_requests);
        min(remaining_concurrent_requests, remaining_request_slots)
    };

    // Send the client requests
    if max_num_requests_to_send > 0 {
        // Let the stream engine decide which requests to create next
        let client_requests = self.stream_engine.create_data_client_requests(
            max_num_requests_to_send,
            global_data_summary,
            self.notification_id_generator.clone(),
        )?;
        for client_request in &client_requests {
            // Send the client request
            let pending_client_response =
                self.send_client_request(false, client_request.clone());

            // Enqueue the pending response (responses are processed in FIFO order)
            self.get_sent_data_requests()?
                .push_back(pending_client_response);
        }

        // Sample-log the number of sent requests (at most once per log window)
        sample!(
            SampleRate::Duration(Duration::from_secs(SENT_REQUESTS_LOG_FREQ_SECS)),
            debug!(
                (LogSchema::new(LogEntry::SendDataRequests)
                    .stream_id(self.data_stream_id)
                    .event(LogEvent::Success)
                    .message(&format!(
                        "Sent {:?} data requests to the network",
                        client_requests.len()
                    )))
            )
        );
    }

    // Update the counters for the complete and pending responses
    metrics::set_complete_pending_data_responses(num_complete_pending_requests);
    metrics::set_pending_data_responses(self.get_num_pending_data_requests()?);
    Ok(())
}
/// Sends a given request to the data client to be forwarded to the network
/// and returns a pending client response. If `request_retry` is true,
/// exponential backoff takes effect (i.e., to increase the request timeout).
fn send_client_request(
    &mut self,
    request_retry: bool,
    data_client_request: DataClientRequest,
) -> PendingClientResponse {
    // Create a new pending client response (fulfilled by the spawned task below)
    let pending_client_response = Arc::new(Mutex::new(Box::new(
        data_notification::PendingClientResponse::new(data_client_request.clone()),
    )));

    // Calculate the request timeout to use, based on the
    // request type and the number of previous failures.
    let request_timeout_ms = if data_client_request.is_optimistic_fetch_request() {
        self.data_client_config.optimistic_fetch_timeout_ms
    } else if data_client_request.is_subscription_request() {
        self.data_client_config.subscription_response_timeout_ms
    } else if !request_retry {
        self.data_client_config.response_timeout_ms
    } else {
        let response_timeout_ms = self.data_client_config.response_timeout_ms;
        let max_response_timeout_ms = self.data_client_config.max_response_timeout_ms;

        // Exponentially increase the timeout based on the number of previous
        // failures (but bounded by the max timeout). Saturating arithmetic is
        // used so a large failure count cannot overflow: the previous
        // `u32::pow(2, count)` panicked in debug builds and wrapped in release
        // builds once the count reached 32, and the unchecked multiply could
        // overflow u64. For counts below 32, the result is unchanged.
        let exponent = min(self.request_failure_count, 63) as u32;
        let backoff_timeout_ms = response_timeout_ms.saturating_mul(1u64 << exponent);
        let request_timeout_ms = min(max_response_timeout_ms, backoff_timeout_ms);

        // Update the retry counter and log the request
        increment_counter_multiple_labels(
            &metrics::RETRIED_DATA_REQUESTS,
            data_client_request.get_label(),
            &request_timeout_ms.to_string(),
        );
        info!(
            (LogSchema::new(LogEntry::RetryDataRequest)
                .stream_id(self.data_stream_id)
                .message(&format!(
                    "Retrying data request type: {:?}, with new timeout: {:?} (ms)",
                    data_client_request.get_label(),
                    request_timeout_ms.to_string()
                )))
        );
        request_timeout_ms
    };

    // Send the request to the network (asynchronously) and track the task
    // handle so it can be aborted if the stream terminates prematurely.
    let join_handle = spawn_request_task(
        self.data_stream_id,
        data_client_request,
        self.aptos_data_client.clone(),
        pending_client_response.clone(),
        request_timeout_ms,
        self.stream_update_notifier.clone(),
    );
    self.spawned_tasks.push(join_handle);

    pending_client_response
}
// TODO(joshlind): this function shouldn't be blocking when trying to send! If there are
// multiple streams, a single blocked stream could cause them all to block.
async fn send_data_notification(
&mut self,
data_notification: DataNotification,
) -> Result<(), Error> {
if let Err(error) = self.notification_sender.send(data_notification).await {
let error = Error::UnexpectedErrorEncountered(error.to_string());
warn!(
(LogSchema::new(LogEntry::StreamNotification)
.stream_id(self.data_stream_id)
.event(LogEvent::Error)
.error(&error)
.message("Failed to send data notification to listener!"))
);
self.send_failure = true;
Err(error)
} else {
Ok(())
}
}
/// Returns true iff a notification send to the listener has failed
/// (i.e., the stream is considered dead).
pub fn send_failure(&self) -> bool {
    self.send_failure
}
/// Creates and sends the end of stream notification along the stream,
/// recording its notification ID so later feedback can be matched.
async fn send_end_of_stream_notification(&mut self) -> Result<(), Error> {
    // Generate a fresh notification ID and remember it as the stream terminator
    let notification_id = self.notification_id_generator.next();
    self.stream_end_notification_id = Some(notification_id);

    // Log the pending end of stream notification
    info!(
        (LogSchema::new(LogEntry::EndOfStreamNotification)
            .stream_id(self.data_stream_id)
            .event(LogEvent::Pending)
            .message("Sent the end of stream notification"))
    );

    // Send the end of stream notification along the stream
    let data_notification = DataNotification::new(notification_id, DataPayload::EndOfStream);
    self.send_data_notification(data_notification).await
}
/// Processes any data client responses that have been received. Note: the
/// responses must be processed in FIFO order.
///
/// If the stream is complete, has failed too many times, or can no longer
/// reach its listener, an end of stream notification is sent (at most once)
/// and no further processing occurs. Otherwise, ready responses are drained
/// from the head of the queue until the queue is empty or the stream becomes
/// head-of-line blocked, after which a fresh batch of requests is sent.
pub async fn process_data_responses(
    &mut self,
    global_data_summary: GlobalDataSummary,
) -> Result<(), Error> {
    // Terminal states: stream complete, too many head-request failures,
    // or a previous notification send failed.
    if self.stream_engine.is_stream_complete()
        || self.request_failure_count >= self.streaming_service_config.max_request_retry
        || self.send_failure
    {
        // Send the end of stream notification exactly once (and only if
        // the listener is still reachable).
        if !self.send_failure && self.stream_end_notification_id.is_none() {
            self.send_end_of_stream_notification().await?;
        }
        return Ok(()); // There's nothing left to do
    }
    // Continuously process any ready data responses
    while let Some(pending_response) = self.pop_pending_response_queue()? {
        // Get the client request and response information
        let maybe_client_response = pending_response.lock().client_response.take();
        let client_response = maybe_client_response.ok_or_else(|| {
            Error::UnexpectedErrorEncountered("The client response should be ready!".into())
        })?;
        // NOTE(review): this borrows a temporary clone of the request; the
        // lock guard is released at the end of the statement.
        let client_request = &pending_response.lock().client_request.clone();
        // Process the client response
        match client_response {
            Ok(client_response) => {
                // Sanity check and process the response
                if sanity_check_client_response_type(client_request, &client_response) {
                    // If the response wasn't enough to satisfy the original request (e.g.,
                    // it was truncated), missing data should be requested.
                    let mut head_of_line_blocked = false;
                    match self.request_missing_data(client_request, &client_response.payload) {
                        Ok(missing_data_requested) => {
                            if missing_data_requested {
                                head_of_line_blocked = true; // We're now head of line blocked on the missing data
                            }
                        },
                        Err(error) => {
                            warn!(LogSchema::new(LogEntry::ReceivedDataResponse)
                                .stream_id(self.data_stream_id)
                                .event(LogEvent::Error)
                                .error(&error)
                                .message("Failed to determine if missing data was requested!"));
                        },
                    }
                    // If the request was a subscription request and the subscription
                    // stream is lagging behind the data advertisements, the stream
                    // engine should be notified (e.g., so that it can catch up).
                    if client_request.is_subscription_request() {
                        if let Err(error) = self.check_subscription_stream_lag(
                            &global_data_summary,
                            &client_response.payload,
                        ) {
                            self.notify_new_data_request_error(client_request, error)?;
                            head_of_line_blocked = true; // We're now head of line blocked on the failed stream
                        }
                    }
                    // The response is valid, send the data notification to the client
                    self.send_data_notification_to_client(client_request, client_response)
                        .await?;
                    // If the request is for specific data, increase the prefetching limit.
                    // Note: we don't increase the limit for new data requests because
                    // those don't invoke the prefetcher (as we're already up-to-date).
                    if !client_request.is_new_data_request() {
                        self.dynamic_prefetching_state
                            .increase_max_concurrent_requests();
                    }
                    // If we're head of line blocked, we should return early
                    if head_of_line_blocked {
                        break;
                    }
                } else {
                    // The sanity check failed
                    self.handle_sanity_check_failure(client_request, &client_response.context)?;
                    break; // We're now head of line blocked on the failed request
                }
            },
            Err(error) => {
                // Handle the error depending on the request type
                if client_request.is_new_data_request() {
                    // The request was for new data. We should notify the
                    // stream engine and clear the requests queue.
                    self.notify_new_data_request_error(client_request, error)?;
                } else {
                    // Decrease the prefetching limit on an error
                    self.dynamic_prefetching_state
                        .decrease_max_concurrent_requests();
                    // Handle the error and simply retry
                    self.handle_data_client_error(client_request, &error)?;
                }
                break; // We're now head of line blocked on the failed request
            },
        }
    }
    // Create and send further client requests to the network
    // to ensure we're maximizing the number of concurrent requests.
    self.create_and_send_client_requests(&global_data_summary)
}
/// Verifies that the subscription stream is not lagging too much (i.e.,
/// behind the data advertisements). If it is, an error is returned.
///
/// The lag is measured as the difference between the highest advertised
/// version and the highest version contained in the subscription response.
fn check_subscription_stream_lag(
    &mut self,
    global_data_summary: &GlobalDataSummary,
    response_payload: &ResponsePayload,
) -> Result<(), aptos_data_client::error::Error> {
    // Get the highest version sent in the subscription response
    let highest_response_version = match response_payload {
        ResponsePayload::NewTransactionsWithProof((transactions_with_proof, _)) => {
            if let Some(first_version) = transactions_with_proof.first_transaction_version {
                let num_transactions = transactions_with_proof.transactions.len();
                first_version
                    .saturating_add(num_transactions as u64)
                    .saturating_sub(1) // first_version + num_txns - 1
            } else {
                return Err(aptos_data_client::error::Error::UnexpectedErrorEncountered(
                    "The first transaction version is missing from the stream response!".into(),
                ));
            }
        },
        ResponsePayload::NewTransactionOutputsWithProof((outputs_with_proof, _)) => {
            if let Some(first_version) = outputs_with_proof.first_transaction_output_version {
                let num_outputs = outputs_with_proof.transactions_and_outputs.len();
                first_version
                    .saturating_add(num_outputs as u64)
                    .saturating_sub(1) // first_version + num_outputs - 1
            } else {
                return Err(aptos_data_client::error::Error::UnexpectedErrorEncountered(
                    "The first output version is missing from the stream response!".into(),
                ));
            }
        },
        _ => {
            return Ok(()); // The response payload doesn't contain a subscription response
        },
    };
    // Get the highest advertised version
    let highest_advertised_version = global_data_summary
        .advertised_data
        .highest_synced_ledger_info()
        .map(|ledger_info| ledger_info.ledger_info().version())
        .ok_or(aptos_data_client::error::Error::UnexpectedErrorEncountered(
            "The highest synced ledger info is missing from the global data summary!".into(),
        ))?;
    // If the stream is not lagging behind, reset the lag and return
    if highest_response_version >= highest_advertised_version {
        self.reset_subscription_stream_lag();
        return Ok(());
    }
    // Otherwise, the stream is lagging behind the advertised version.
    // Check if the stream is beyond recovery (i.e., has failed).
    let current_stream_lag =
        highest_advertised_version.saturating_sub(highest_response_version);
    if let Some(mut subscription_stream_lag) = self.subscription_stream_lag.take() {
        // Check if the stream lag is beyond recovery.
        // NOTE(review): the binding is `mut`, so `is_beyond_recovery`
        // presumably updates the tracked lag state — confirm in its impl.
        if subscription_stream_lag
            .is_beyond_recovery(self.streaming_service_config, current_stream_lag)
        {
            return Err(
                aptos_data_client::error::Error::SubscriptionStreamIsLagging(format!(
                    "The subscription stream is beyond recovery! Current lag: {:?}, last lag: {:?},",
                    current_stream_lag, subscription_stream_lag.version_lag
                )),
            );
        }
        // The stream is lagging, but it's not yet beyond recovery
        self.set_subscription_stream_lag(subscription_stream_lag);
    } else {
        // The stream was not previously lagging, but it is now!
        let subscription_stream_lag =
            SubscriptionStreamLag::new(current_stream_lag, self.time_service.clone());
        self.set_subscription_stream_lag(subscription_stream_lag);
    }
    Ok(())
}
/// Notifies the stream engine that a new data request error was encountered
/// and clears the sent requests queue (the queued requests are now stale).
fn notify_new_data_request_error(
    &mut self,
    client_request: &DataClientRequest,
    error: aptos_data_client::error::Error,
) -> Result<(), Error> {
    // Forward the error to the stream engine
    self.stream_engine
        .notify_new_data_request_error(client_request, error)?;

    // Drop all in-flight requests and their tasks
    self.clear_sent_data_requests_queue();
    Ok(())
}
/// Requests any missing data from the previous client response
/// and returns true iff missing data was requested.
fn request_missing_data(
    &mut self,
    data_client_request: &DataClientRequest,
    response_payload: &ResponsePayload,
) -> Result<bool, Error> {
    // Determine whether the response left any data unfulfilled
    let missing_data_request =
        match create_missing_data_request(data_client_request, response_payload)? {
            Some(missing_data_request) => missing_data_request,
            None => return Ok(false), // No missing data was requested
        };

    // Increment the missing client request counter
    increment_counter(
        &metrics::SENT_DATA_REQUESTS_FOR_MISSING_DATA,
        data_client_request.get_label(),
    );

    // Send the missing data request. Note: the request is owned here,
    // so the redundant clone in the original has been removed.
    let pending_client_response = self.send_client_request(false, missing_data_request);

    // Push the pending response to the front of the queue (the missing
    // data must be processed before anything behind it).
    self.get_sent_data_requests()?
        .push_front(pending_client_response);
    Ok(true) // Missing data was requested
}
/// Pops and returns the first pending client response if the response has
/// been received. Returns `None` otherwise.
fn pop_pending_response_queue(&mut self) -> Result<Option<PendingClientResponse>, Error> {
    let sent_data_requests = self.get_sent_data_requests()?;

    // Peek at the head of the queue to see if its response has arrived
    let head_response_ready = sent_data_requests
        .front()
        .map_or(false, |data_request| data_request.lock().client_response.is_some());

    // Only pop the head once its response is ready (FIFO processing)
    let pending_client_response = if head_response_ready {
        sent_data_requests.pop_front()
    } else {
        None
    };
    Ok(pending_client_response)
}
/// Handles a client response that failed sanity checks: the responder is
/// reported for an invalid payload and the request is resent.
fn handle_sanity_check_failure(
    &mut self,
    data_client_request: &DataClientRequest,
    response_context: &ResponseContext,
) -> Result<(), Error> {
    // Log the sanity check failure
    error!(LogSchema::new(LogEntry::ReceivedDataResponse)
        .stream_id(self.data_stream_id)
        .event(LogEvent::Error)
        .message("Encountered a client response that failed the sanity checks!"));

    // Penalize the responder and retry the request
    self.notify_bad_response(response_context, ResponseError::InvalidPayloadDataType);
    self.resend_data_client_request(data_client_request)
}
/// Handles an error returned by the data client in relation to a request:
/// the error is logged and the request is retried.
fn handle_data_client_error(
    &mut self,
    data_client_request: &DataClientRequest,
    data_client_error: &aptos_data_client::error::Error,
) -> Result<(), Error> {
    // Log the error before retrying
    warn!(LogSchema::new(LogEntry::ReceivedDataResponse)
        .stream_id(self.data_stream_id)
        .event(LogEvent::Error)
        .error(&data_client_error.clone().into())
        .message("Encountered a data client error!"));

    // TODO(joshlind): can we identify the best way to react to the error?
    self.resend_data_client_request(data_client_request)
}
/// Resends a failed data client request and pushes the pending notification
/// to the head of the pending notifications batch.
fn resend_data_client_request(
    &mut self,
    data_client_request: &DataClientRequest,
) -> Result<(), Error> {
    // Track another failure for the request at the head of the queue
    self.request_failure_count += 1;

    // Retry the request (with exponential backoff enabled)
    let retried_client_response = self.send_client_request(true, data_client_request.clone());

    // Re-queue the pending response at the front so FIFO ordering is preserved
    self.get_sent_data_requests()?
        .push_front(retried_client_response);
    Ok(())
}
/// Notifies the Aptos data client of a bad client response
/// (via the callback attached to the response context).
fn notify_bad_response(
    &self,
    response_context: &ResponseContext,
    response_error: ResponseError,
) {
    // Log which response is being reported and why
    info!(LogSchema::new(LogEntry::ReceivedDataResponse)
        .stream_id(self.data_stream_id)
        .event(LogEvent::Error)
        .message(&format!(
            "Notifying the data client of a bad response. Response id: {:?}, error: {:?}",
            response_context.id, response_error
        )));

    // Invoke the response callback with the error
    response_context
        .response_callback
        .notify_bad_response(response_error);
}
/// Sends a data notification to the client along the stream
///
/// The client response is first transformed into a data notification by the
/// stream engine. If the engine produces no notification, nothing is sent
/// (and the failure count is left unchanged).
async fn send_data_notification_to_client(
    &mut self,
    data_client_request: &DataClientRequest,
    data_client_response: Response<ResponsePayload>,
) -> Result<(), Error> {
    let (response_context, response_payload) = data_client_response.into_parts();
    // Create a new data notification
    if let Some(data_notification) = self
        .stream_engine
        .transform_client_response_into_notification(
            data_client_request,
            response_payload,
            self.notification_id_generator.clone(),
        )?
    {
        // Update the metrics for the data notification send latency
        metrics::observe_duration(
            &metrics::DATA_NOTIFICATION_SEND_LATENCY,
            data_client_request.get_label(),
            response_context.creation_time,
        );
        // Save the response context for this notification ID
        // (so later feedback can be routed back to the data client)
        let notification_id = data_notification.notification_id;
        self.insert_notification_response_mapping(notification_id, response_context)?;
        // Send the notification along the stream
        trace!(
            (LogSchema::new(LogEntry::StreamNotification)
                .stream_id(self.data_stream_id)
                .event(LogEvent::Success)
                .message(&format!(
                    "Sent a single stream notification! Notification ID: {:?}",
                    notification_id
                )))
        );
        self.send_data_notification(data_notification).await?;
        // Reset the failure count. We've sent a notification and can move on.
        self.request_failure_count = 0;
    }
    Ok(())
}
fn insert_notification_response_mapping(
&mut self,
notification_id: NotificationId,
response_context: ResponseContext,
) -> Result<(), Error> {
if let Some(response_context) = self
.notifications_to_responses
.insert(notification_id, response_context)
{
Err(Error::UnexpectedErrorEncountered(format!(
"Duplicate sent notification ID found! \
Notification ID: {:?}, \
previous Response context: {:?}",
notification_id, response_context
)))
} else {
self.garbage_collect_notification_response_map()
}
}
/// Evicts the oldest entries from the notification response map whenever
/// it grows beyond the configured maximum number of mappings.
fn garbage_collect_notification_response_map(&mut self) -> Result<(), Error> {
    // Nothing to do while the map is within the configured bound
    let max_notification_id_mappings =
        self.streaming_service_config.max_notification_id_mappings;
    let map_length = self.notifications_to_responses.len() as u64;
    if map_length <= max_notification_id_mappings {
        return Ok(());
    }

    // Determine how many entries must be evicted
    let num_entries_to_remove = map_length
        .checked_sub(max_notification_id_mappings)
        .ok_or_else(|| {
            Error::IntegerOverflow("Number of entries to remove has overflown!".into())
        })?;
    debug!(
        (LogSchema::new(LogEntry::StreamNotification)
            .stream_id(self.data_stream_id)
            .event(LogEvent::Success)
            .message(&format!(
                "Garbage collecting {:?} items from the notification response map.",
                num_entries_to_remove
            )))
    );

    // BTreeMap keys iterate in sorted order, so the first keys are the
    // lowest notification IDs (i.e., the oldest notifications).
    let keys_to_remove: Vec<_> = self
        .notifications_to_responses
        .keys()
        .take(num_entries_to_remove as usize)
        .copied()
        .collect();
    for key_to_remove in keys_to_remove {
        self.notifications_to_responses.remove(&key_to_remove);
    }
    Ok(())
}
/// Verifies that the data required by the stream can be satisfied using the
/// currently advertised data in the network. If not, returns an error.
pub fn ensure_data_is_available(&self, advertised_data: &AdvertisedData) -> Result<(), Error> {
if !self
.stream_engine
.is_remaining_data_available(advertised_data)?
{
return Err(Error::DataIsUnavailable(format!(
"Unable to satisfy stream engine: {:?}, with advertised data: {:?}",
self.stream_engine, advertised_data
)));
}
Ok(())
}
/// Returns the number of pending requests in the sent data requests queue
/// that have already completed (i.e., are no longer in-flight).
fn get_num_complete_pending_requests(&mut self) -> Result<u64, Error> {
    // Only successful responses count as complete (failures will be retried)
    let num_complete_pending_requests = self
        .get_sent_data_requests()?
        .iter()
        .filter(|sent_data_request| {
            matches!(
                sent_data_request.lock().client_response.as_ref(),
                Some(client_response) if client_response.is_ok()
            )
        })
        .count() as u64;
    Ok(num_complete_pending_requests)
}
/// Returns the number of pending requests in the sent data requests queue
fn get_num_pending_data_requests(&mut self) -> Result<u64, Error> {
    // The queue length is exactly the number of pending requests
    let sent_data_requests = self.get_sent_data_requests()?;
    Ok(sent_data_requests.len() as u64)
}
/// Assumes the caller has already verified that `sent_data_requests` has
/// been initialized.
fn get_sent_data_requests(&mut self) -> Result<&mut VecDeque<PendingClientResponse>, Error> {
self.sent_data_requests.as_mut().ok_or_else(|| {
Error::UnexpectedErrorEncountered("Sent data requests should be initialized!".into())
})
}
#[cfg(test)]
/// This is exposed and used only for test purposes.
pub fn get_sent_requests_and_notifications(
    &mut self,
) -> (
    &mut Option<VecDeque<PendingClientResponse>>,
    &mut BTreeMap<NotificationId, ResponseContext>,
) {
    // Hand out mutable references to both internal states directly
    (
        &mut self.sent_data_requests,
        &mut self.notifications_to_responses,
    )
}
#[cfg(test)]
/// Returns a copy of the subscription stream lag (for testing)
pub fn get_subscription_stream_lag(&self) -> Option<SubscriptionStreamLag> {
    self.subscription_stream_lag.as_ref().cloned()
}
}
impl<T> Drop for DataStream<T> {
    /// Terminates the stream by aborting all spawned tasks
    fn drop(&mut self) {
        // Abort on drop so no background tasks outlive the stream itself
        self.abort_spawned_tasks();
    }
}
impl<T> DataStream<T> {
    /// Aborts all currently spawned tasks. This is useful if the stream is
    /// terminated prematurely, or if the sent data requests are cleared.
    fn abort_spawned_tasks(&mut self) {
        self.spawned_tasks
            .iter()
            .for_each(|spawned_task| spawned_task.abort());
    }
}
/// A simple container to track the start time and lag of a subscription stream
#[derive(Clone, Debug)]
pub struct SubscriptionStreamLag {
    pub start_time: Instant, // The time at which the lag was first observed
    pub time_service: TimeService, // Used to measure how long the lag has persisted
    pub version_lag: u64, // The last recorded version lag (only updated when the lag improves)
}
impl SubscriptionStreamLag {
    /// Creates a new subscription stream lag starting at the current time
    fn new(version_lag: u64, time_service: TimeService) -> Self {
        let start_time = time_service.now();
        Self {
            start_time,
            time_service,
            version_lag,
        }
    }
    /// Returns true iff the subscription stream lag is considered to be
    /// beyond recovery. This occurs when: (i) the stream is lagging for
    /// too long; and (ii) the lag has increased since the last check.
    fn is_beyond_recovery(
        &mut self,
        streaming_service_config: DataStreamingServiceConfig,
        current_stream_lag: u64,
    ) -> bool {
        // Determine how long the stream has been lagging so far
        let lag_duration = self.time_service.now().duration_since(self.start_time);
        let max_lag_duration =
            Duration::from_secs(streaming_service_config.max_subscription_stream_lag_secs);
        // The stream has failed only if the lag got worse and enough time has passed
        if current_stream_lag > self.version_lag && lag_duration >= max_lag_duration {
            return true; // The stream is beyond recovery
        }
        // Otherwise, track the best (lowest) lag seen so far. This
        // ensures the recorded lag can only improve over time.
        self.version_lag = self.version_lag.min(current_stream_lag);
        false // The stream is not yet beyond recovery
    }
}
/// Allows listening to data streams (i.e., streams of data notifications).