46
46
import org .apache .doris .thrift .TWarmUpCacheAsyncResponse ;
47
47
48
48
import com .google .common .base .Preconditions ;
49
+ import lombok .Getter ;
49
50
import org .apache .logging .log4j .LogManager ;
50
51
import org .apache .logging .log4j .Logger ;
51
52
54
55
import java .util .HashSet ;
55
56
import java .util .List ;
56
57
import java .util .Map ;
58
+ import java .util .Objects ;
57
59
import java .util .Random ;
58
60
import java .util .Set ;
59
61
import java .util .concurrent .ConcurrentHashMap ;
@@ -94,7 +96,7 @@ public class CloudTabletRebalancer extends MasterDaemon {
94
96
95
97
private LinkedBlockingQueue <Pair <Long , Long >> tabletsMigrateTasks = new LinkedBlockingQueue <Pair <Long , Long >>();
96
98
97
- private Map <Long , InfightTask > tabletToInfightTask = new HashMap <Long , InfightTask >();
99
+ private Map <InfightTablet , InfightTask > tabletToInfightTask = new HashMap <>();
98
100
99
101
private long assignedErrNum = 0 ;
100
102
@@ -115,12 +117,39 @@ public enum BalanceType {
115
117
PARTITION
116
118
}
117
119
120
+ @ Getter
121
+ private class InfightTablet {
122
+ private final Long tabletId ;
123
+ private final String clusterId ;
124
+
125
+ public InfightTablet (Long tabletId , String clusterId ) {
126
+ this .tabletId = tabletId ;
127
+ this .clusterId = clusterId ;
128
+ }
129
+
130
+ @ Override
131
+ public boolean equals (Object o ) {
132
+ if (this == o ) {
133
+ return true ;
134
+ }
135
+ if (o == null || getClass () != o .getClass ()) {
136
+ return false ;
137
+ }
138
+ InfightTablet that = (InfightTablet ) o ;
139
+ return tabletId .equals (that .tabletId ) && clusterId .equals (that .clusterId );
140
+ }
141
+
142
+ @ Override
143
+ public int hashCode () {
144
+ return Objects .hash (tabletId , clusterId );
145
+ }
146
+ }
147
+
118
148
private class InfightTask {
119
149
public Tablet pickedTablet ;
120
150
public long srcBe ;
121
151
public long destBe ;
122
152
public boolean isGlobal ;
123
- public String clusterId ;
124
153
public Map <Long , List <Tablet >> beToTablets ;
125
154
public long startTimestamp ;
126
155
BalanceType balanceType ;
@@ -343,41 +372,44 @@ public void globalBalance() {
343
372
}
344
373
345
374
public void checkInflghtWarmUpCacheAsync () {
346
- Map <Long , List <Long >> beToTabletIds = new HashMap <Long , List <Long >>();
375
+ Map <Long , List <InfightTask >> beToInfightTasks = new HashMap <Long , List <InfightTask >>();
347
376
348
- for (Map .Entry <Long , InfightTask > entry : tabletToInfightTask .entrySet ()) {
349
- beToTabletIds .putIfAbsent (entry .getValue ().destBe , new ArrayList <Long >());
350
- beToTabletIds .get (entry .getValue ().destBe ).add (entry .getValue (). pickedTablet . getId ());
377
+ for (Map .Entry <InfightTablet , InfightTask > entry : tabletToInfightTask .entrySet ()) {
378
+ beToInfightTasks .putIfAbsent (entry .getValue ().destBe , new ArrayList <>());
379
+ beToInfightTasks .get (entry .getValue ().destBe ).add (entry .getValue ());
351
380
}
352
381
353
382
List <UpdateCloudReplicaInfo > infos = new ArrayList <>();
354
- for (Map .Entry <Long , List <Long >> entry : beToTabletIds .entrySet ()) {
383
+ for (Map .Entry <Long , List <InfightTask >> entry : beToInfightTasks .entrySet ()) {
355
384
LOG .info ("before pre cache check dest be {} inflight task num {}" , entry .getKey (), entry .getValue ().size ());
356
385
Backend destBackend = cloudSystemInfoService .getBackend (entry .getKey ());
357
386
if (destBackend == null ) {
358
- for (long tabletId : entry .getValue ()) {
359
- tabletToInfightTask .remove (tabletId );
387
+ for (InfightTask task : entry .getValue ()) {
388
+ for (InfightTablet key : tabletToInfightTask .keySet ()) {
389
+ tabletToInfightTask .remove (new InfightTablet (task .pickedTablet .getId (), key .clusterId ));
390
+ }
360
391
}
361
392
continue ;
362
393
}
363
-
364
- Map <Long , Boolean > taskDone = sendCheckWarmUpCacheAsyncRpc (entry .getValue (), entry .getKey ());
394
+ List <Long > tablets = entry .getValue ().stream ()
395
+ .map (task -> task .pickedTablet .getId ()).collect (Collectors .toList ());
396
+ Map <Long , Boolean > taskDone = sendCheckWarmUpCacheAsyncRpc (tablets , entry .getKey ());
365
397
if (taskDone == null ) {
366
398
LOG .warn ("sendCheckWarmUpCacheAsyncRpc return null be {}, inFight tasks {}" ,
367
399
entry .getKey (), entry .getValue ());
368
400
continue ;
369
401
}
370
-
402
+ String clusterId = cloudSystemInfoService . getBackend ( entry . getKey ()). getCloudClusterId ();
371
403
for (Map .Entry <Long , Boolean > result : taskDone .entrySet ()) {
372
- InfightTask task = tabletToInfightTask . get ( result . getKey ());
373
- if ( result .getValue ()
374
- || System .currentTimeMillis () / 1000 - task .startTimestamp
375
- > Config .cloud_pre_heating_time_limit_sec ) {
404
+ InfightTask task = tabletToInfightTask
405
+ . getOrDefault ( new InfightTablet ( result .getKey (), clusterId ), null );
406
+ if ( task != null && ( result . getValue () || System .currentTimeMillis () / 1000 - task .startTimestamp
407
+ > Config .cloud_pre_heating_time_limit_sec )) {
376
408
if (!result .getValue ()) {
377
409
LOG .info ("{} pre cache timeout, forced to change the mapping" , result .getKey ());
378
410
}
379
- updateClusterToBeMap (task .pickedTablet , task .destBe , task . clusterId , infos );
380
- tabletToInfightTask .remove (result . getKey ( ));
411
+ updateClusterToBeMap (task .pickedTablet , task .destBe , clusterId , infos );
412
+ tabletToInfightTask .remove (new InfightTablet ( task . pickedTablet . getId (), clusterId ));
381
413
}
382
414
}
383
415
}
@@ -393,13 +425,13 @@ public void checkInflghtWarmUpCacheAsync() {
393
425
}
394
426
395
427
// recalculate inflight beToTablets, just for print the log
396
- beToTabletIds = new HashMap < Long , List < Long >> ();
397
- for (Map .Entry <Long , InfightTask > entry : tabletToInfightTask .entrySet ()) {
398
- beToTabletIds .putIfAbsent (entry .getValue ().destBe , new ArrayList <Long >());
399
- beToTabletIds .get (entry .getValue ().destBe ).add (entry .getValue (). pickedTablet . getId ());
428
+ beToInfightTasks . clear ();
429
+ for (Map .Entry <InfightTablet , InfightTask > entry : tabletToInfightTask .entrySet ()) {
430
+ beToInfightTasks .putIfAbsent (entry .getValue ().destBe , new ArrayList <>());
431
+ beToInfightTasks .get (entry .getValue ().destBe ).add (entry .getValue ());
400
432
}
401
433
402
- for (Map .Entry <Long , List <Long >> entry : beToTabletIds .entrySet ()) {
434
+ for (Map .Entry <Long , List <InfightTask >> entry : beToInfightTasks .entrySet ()) {
403
435
LOG .info ("after pre cache check dest be {} inflight task num {}" , entry .getKey (), entry .getValue ().size ());
404
436
}
405
437
}
@@ -449,7 +481,7 @@ public void checkDecommissionState(Map<String, List<Long>> clusterToBes) {
449
481
}
450
482
LOG .info ("notify decommission response: {} " , response );
451
483
} catch (RpcException e ) {
452
- LOG .info ("failed to notify decommission {} " , e );
484
+ LOG .info ("failed to notify decommission" , e );
453
485
return ;
454
486
}
455
487
beToDecommissionedTime .put (beId , System .currentTimeMillis () / 1000 );
@@ -552,8 +584,10 @@ public void statRouteInfo() {
552
584
fillBeToTablets (bes .get (0 ), table .getId (), partition .getId (), index .getId (), tablet ,
553
585
tmpBeToTabletsGlobal , beToTabletsInTable , this .partitionToTablets );
554
586
555
- if (tabletToInfightTask .containsKey (tablet .getId ())) {
556
- InfightTask task = tabletToInfightTask .get (tablet .getId ());
587
+ InfightTask task = tabletToInfightTask
588
+ .getOrDefault (new InfightTablet (tablet .getId (), cluster ), null );
589
+
590
+ if (task != null ) {
557
591
fillBeToTablets (task .destBe , table .getId (), partition .getId (), index .getId (), tablet ,
558
592
futureBeToTabletsGlobal , futureBeToTabletsInTable , futurePartitionToTablets );
559
593
} else {
@@ -808,9 +842,7 @@ private boolean isConflict(long srcBe, long destBe, CloudReplica cloudReplica, B
808
842
List <Tablet > destBeTablets = beToTabletsInParts .get (cloudReplica .getPartitionId ())
809
843
.get (cloudReplica .getIndexId ()).get (destBe );
810
844
long minBeSize = destBeTablets == null ? 0 : destBeTablets .size ();
811
- if (minBeSize >= maxBeSize ) {
812
- return true ;
813
- }
845
+ return minBeSize >= maxBeSize ;
814
846
}
815
847
816
848
return false ;
@@ -881,10 +913,9 @@ private void balanceImpl(List<Long> bes, String clusterId, Map<Long, List<Tablet
881
913
task .srcBe = srcBe ;
882
914
task .destBe = destBe ;
883
915
task .balanceType = balanceType ;
884
- task .clusterId = clusterId ;
885
916
task .beToTablets = beToTablets ;
886
917
task .startTimestamp = System .currentTimeMillis () / 1000 ;
887
- tabletToInfightTask .put (pickedTablet .getId (), task );
918
+ tabletToInfightTask .put (new InfightTablet ( pickedTablet .getId (), clusterId ), task );
888
919
889
920
LOG .info ("pre cache {} from {} to {}, cluster {} minNum {} maxNum {} beNum {} tabletsNum {}, part {}" ,
890
921
pickedTablet .getId (), srcBe , destBe , clusterId ,
@@ -936,7 +967,7 @@ private void migrateTablets(Long srcBe, Long dstBe) {
936
967
CloudReplica cloudReplica = (CloudReplica ) tablet .getReplicas ().get (0 );
937
968
Backend be = cloudSystemInfoService .getBackend (srcBe );
938
969
if (be == null ) {
939
- LOG .info ("backend {} not found" , be );
970
+ LOG .info ("src backend {} not found" , srcBe );
940
971
continue ;
941
972
}
942
973
String clusterId = be .getCloudClusterId ();
0 commit comments