/build/source/nativelink-scheduler/src/cache_lookup_scheduler.rs
Line | Count | Source |
1 | | // Copyright 2024 The NativeLink Authors. All rights reserved. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | use std::collections::HashMap; |
16 | | use std::sync::Arc; |
17 | | |
18 | | use async_trait::async_trait; |
19 | | use nativelink_error::{make_err, Code, Error, ResultExt}; |
20 | | use nativelink_metric::{MetricsComponent, RootMetricsComponent}; |
21 | | use nativelink_proto::build::bazel::remote::execution::v2::{ |
22 | | ActionResult as ProtoActionResult, GetActionResultRequest, |
23 | | }; |
24 | | use nativelink_store::ac_utils::get_and_decode_digest; |
25 | | use nativelink_store::grpc_store::GrpcStore; |
26 | | use nativelink_util::action_messages::{ |
27 | | ActionInfo, ActionStage, ActionState, ActionUniqueKey, ActionUniqueQualifier, OperationId, |
28 | | }; |
29 | | use nativelink_util::background_spawn; |
30 | | use nativelink_util::common::DigestInfo; |
31 | | use nativelink_util::digest_hasher::DigestHasherFunc; |
32 | | use nativelink_util::known_platform_property_provider::KnownPlatformPropertyProvider; |
33 | | use nativelink_util::operation_state_manager::{ |
34 | | ActionStateResult, ActionStateResultStream, ClientStateManager, OperationFilter, |
35 | | }; |
36 | | use nativelink_util::store_trait::Store; |
37 | | use parking_lot::{Mutex, MutexGuard}; |
38 | | use scopeguard::guard; |
39 | | use tokio::sync::oneshot; |
40 | | use tonic::{Request, Response}; |
41 | | use tracing::{event, Level}; |
42 | | |
43 | | /// Actions that are having their cache checked or failed cache lookup and are |
44 | | /// being forwarded upstream. Missing the `skip_cache_check` actions which are |
45 | | /// forwarded directly. |
46 | | type CheckActions = HashMap< |
47 | | ActionUniqueKey, |
48 | | Vec<( |
49 | | OperationId, |
50 | | oneshot::Sender<Result<Box<dyn ActionStateResult>, Error>>, |
51 | | )>, |
52 | | >; |
53 | | |
54 | | #[derive(MetricsComponent)] |
55 | | pub struct CacheLookupScheduler { |
56 | | /// A reference to the AC to find existing actions in. |
57 | | /// To prevent unintended issues, this store should probably be a `CompletenessCheckingStore`. |
58 | | #[metric(group = "ac_store")] |
59 | | ac_store: Store, |
60 | | /// The "real" scheduler to use to perform actions if they were not found |
61 | | /// in the action cache. |
62 | | #[metric(group = "action_scheduler")] |
63 | | action_scheduler: Arc<dyn ClientStateManager>, |
64 | | /// Actions that are currently performing a `CacheCheck`. |
65 | | inflight_cache_checks: Arc<Mutex<CheckActions>>, |
66 | | } |
67 | | |
68 | 0 | async fn get_action_from_store( |
69 | 0 | ac_store: &Store, |
70 | 0 | action_digest: DigestInfo, |
71 | 0 | instance_name: String, |
72 | 0 | digest_function: DigestHasherFunc, |
73 | 0 | ) -> Result<ProtoActionResult, Error> { |
74 | | // If we are a GrpcStore we shortcut here, as this is a special store. |
75 | 0 | if let Some(grpc_store) = ac_store.downcast_ref::<GrpcStore>(Some(action_digest.into())) { Branch (75:12): [True: 0, False: 0]
Branch (75:12): [Folded - Ignored]
|
76 | 0 | let action_result_request = GetActionResultRequest { |
77 | 0 | instance_name, |
78 | 0 | action_digest: Some(action_digest.into()), |
79 | 0 | inline_stdout: false, |
80 | 0 | inline_stderr: false, |
81 | 0 | inline_output_files: Vec::new(), |
82 | 0 | digest_function: digest_function.proto_digest_func().into(), |
83 | 0 | }; |
84 | 0 | grpc_store |
85 | 0 | .get_action_result(Request::new(action_result_request)) |
86 | 0 | .await |
87 | 0 | .map(Response::into_inner) |
88 | | } else { |
89 | 0 | get_and_decode_digest::<ProtoActionResult>(ac_store, action_digest.into()).await |
90 | | } |
91 | 0 | } |
92 | | |
93 | | /// Future for when `ActionStateResults` are known. |
94 | | type ActionStateResultOneshot = oneshot::Receiver<Result<Box<dyn ActionStateResult>, Error>>; |
95 | | |
96 | 0 | fn subscribe_to_existing_action( |
97 | 0 | inflight_cache_checks: &mut MutexGuard<CheckActions>, |
98 | 0 | unique_qualifier: &ActionUniqueKey, |
99 | 0 | client_operation_id: &OperationId, |
100 | 0 | ) -> Option<ActionStateResultOneshot> { |
101 | 0 | inflight_cache_checks |
102 | 0 | .get_mut(unique_qualifier) |
103 | 0 | .map(|oneshots| { |
104 | 0 | let (tx, rx) = oneshot::channel(); |
105 | 0 | oneshots.push((client_operation_id.clone(), tx)); |
106 | 0 | rx |
107 | 0 | }) |
108 | 0 | } |
109 | | |
110 | | struct CacheLookupActionStateResult { |
111 | | action_state: Arc<ActionState>, |
112 | | change_called: bool, |
113 | | } |
114 | | |
115 | | #[async_trait] |
116 | | impl ActionStateResult for CacheLookupActionStateResult { |
117 | 0 | async fn as_state(&self) -> Result<Arc<ActionState>, Error> { |
118 | 0 | Ok(self.action_state.clone()) |
119 | 0 | } |
120 | | |
121 | 0 | async fn changed(&mut self) -> Result<Arc<ActionState>, Error> { |
122 | 0 | if self.change_called { Branch (122:12): [True: 0, False: 0]
Branch (122:12): [Folded - Ignored]
|
123 | 0 | return Err(make_err!( |
124 | 0 | Code::Internal, |
125 | 0 | "CacheLookupActionStateResult::changed called twice" |
126 | 0 | )); |
127 | 0 | } |
128 | 0 | self.change_called = true; |
129 | 0 | Ok(self.action_state.clone()) |
130 | 0 | } |
131 | | |
132 | 0 | async fn as_action_info(&self) -> Result<Arc<ActionInfo>, Error> { |
133 | | // TODO(allada) We should probably remove as_action_info() |
134 | | // or implement it properly. |
135 | 0 | return Err(make_err!( |
136 | 0 | Code::Unimplemented, |
137 | 0 | "as_action_info not implemented for CacheLookupActionStateResult::as_action_info" |
138 | 0 | )); |
139 | 0 | } |
140 | | } |
141 | | |
142 | | impl CacheLookupScheduler { |
143 | 2 | pub fn new( |
144 | 2 | ac_store: Store, |
145 | 2 | action_scheduler: Arc<dyn ClientStateManager>, |
146 | 2 | ) -> Result<Self, Error> { |
147 | 2 | Ok(Self { |
148 | 2 | ac_store, |
149 | 2 | action_scheduler, |
150 | 2 | inflight_cache_checks: Arc::default(), |
151 | 2 | }) |
152 | 2 | } |
153 | | |
154 | 1 | async fn inner_add_action( |
155 | 1 | &self, |
156 | 1 | client_operation_id: OperationId, |
157 | 1 | action_info: Arc<ActionInfo>, |
158 | 1 | ) -> Result<Box<dyn ActionStateResult>, Error> { |
159 | 1 | let unique_key0 = match &action_info.unique_qualifier { |
160 | 0 | ActionUniqueQualifier::Cachable(unique_key) => unique_key.clone(), |
161 | | ActionUniqueQualifier::Uncachable(_) => { |
162 | | // Cache lookup skipped, forward to the upstream. |
163 | 1 | return self |
164 | 1 | .action_scheduler |
165 | 1 | .add_action(client_operation_id, action_info) |
166 | 1 | .await; |
167 | | } |
168 | | }; |
169 | | |
170 | 0 | let cache_check_result = { |
171 | 0 | // Check this isn't a duplicate request first. |
172 | 0 | let mut inflight_cache_checks = self.inflight_cache_checks.lock(); |
173 | 0 | subscribe_to_existing_action( |
174 | 0 | &mut inflight_cache_checks, |
175 | 0 | &unique_key, |
176 | 0 | &client_operation_id, |
177 | 0 | ) |
178 | 0 | .ok_or_else(move || { |
179 | 0 | let (action_listener_tx, action_listener_rx) = oneshot::channel(); |
180 | 0 | inflight_cache_checks.insert( |
181 | 0 | unique_key.clone(), |
182 | 0 | vec![(client_operation_id, action_listener_tx)], |
183 | 0 | ); |
184 | 0 | // In the event we loose the reference to our `scope_guard`, it will remove |
185 | 0 | // the action from the inflight_cache_checks map. |
186 | 0 | let inflight_cache_checks = self.inflight_cache_checks.clone(); |
187 | 0 | ( |
188 | 0 | action_listener_rx, |
189 | 0 | guard((), move |()| { |
190 | 0 | inflight_cache_checks.lock().remove(&unique_key); |
191 | 0 | }), |
192 | 0 | ) |
193 | 0 | }) |
194 | | }; |
195 | 0 | let (action_listener_rx, scope_guard) = match cache_check_result { |
196 | 0 | Ok(action_listener_fut) => { |
197 | 0 | let action_listener = action_listener_fut.await.map_err(|_| { |
198 | 0 | make_err!( |
199 | 0 | Code::Internal, |
200 | 0 | "ActionStateResult tx hung up in CacheLookupScheduler::add_action" |
201 | 0 | ) |
202 | 0 | })?; |
203 | 0 | return action_listener; |
204 | | } |
205 | 0 | Err(client_tx_and_scope_guard) => client_tx_and_scope_guard, |
206 | 0 | }; |
207 | 0 |
|
208 | 0 | let ac_store = self.ac_store.clone(); |
209 | 0 | let action_scheduler = self.action_scheduler.clone(); |
210 | 0 | let inflight_cache_checks = self.inflight_cache_checks.clone(); |
211 | 0 | // We need this spawn because we are returning a stream and this spawn will populate the stream's data. |
212 | 0 | background_spawn!("cache_lookup_scheduler_add_action", async move { |
213 | 0 | // If our spawn ever dies, we will remove the action from the inflight_cache_checks map. |
214 | 0 | let _scope_guard = scope_guard; |
215 | | |
216 | 0 | let unique_key = match &action_info.unique_qualifier { |
217 | 0 | ActionUniqueQualifier::Cachable(unique_key) => unique_key, |
218 | 0 | ActionUniqueQualifier::Uncachable(unique_key) => { |
219 | 0 | event!( |
220 | 0 | Level::ERROR, |
221 | | ?action_info, |
222 | 0 | "ActionInfo::unique_qualifier should be ActionUniqueQualifier::Cachable()" |
223 | | ); |
224 | 0 | unique_key |
225 | | } |
226 | | }; |
227 | | |
228 | | // Perform cache check. |
229 | 0 | let instance_name = action_info.unique_qualifier.instance_name().clone(); |
230 | 0 | let maybe_action_result = get_action_from_store( |
231 | 0 | &ac_store, |
232 | 0 | action_info.unique_qualifier.digest(), |
233 | 0 | instance_name, |
234 | 0 | action_info.unique_qualifier.digest_function(), |
235 | 0 | ) |
236 | 0 | .await; |
237 | 0 | match maybe_action_result { |
238 | 0 | Ok(action_result) => { |
239 | 0 | let maybe_pending_txs = { |
240 | 0 | let mut inflight_cache_checks = inflight_cache_checks.lock(); |
241 | 0 | // We are ready to resolve the in-flight actions. We remove the |
242 | 0 | // in-flight actions from the map. |
243 | 0 | inflight_cache_checks.remove(unique_key) |
244 | | }; |
245 | 0 | let Some(pending_txs) = maybe_pending_txs else { Branch (245:25): [True: 0, False: 0]
Branch (245:25): [Folded - Ignored]
|
246 | 0 | return; // Nobody is waiting for this action anymore. |
247 | | }; |
248 | 0 | let mut action_state = ActionState { |
249 | 0 | client_operation_id: OperationId::default(), |
250 | 0 | stage: ActionStage::CompletedFromCache(action_result), |
251 | 0 | action_digest: action_info.unique_qualifier.digest(), |
252 | 0 | }; |
253 | | |
254 | 0 | for (client_operation_id, pending_tx) in pending_txs { |
255 | 0 | action_state.client_operation_id = client_operation_id; |
256 | 0 | // Ignore errors here, as the other end may have hung up. |
257 | 0 | let _ = pending_tx.send(Ok(Box::new(CacheLookupActionStateResult { |
258 | 0 | action_state: Arc::new(action_state.clone()), |
259 | 0 | change_called: false, |
260 | 0 | }))); |
261 | 0 | } |
262 | 0 | return; |
263 | | } |
264 | 0 | Err(err) => { |
265 | 0 | // NotFound errors just mean we need to execute our action. |
266 | 0 | if err.code != Code::NotFound { Branch (266:24): [True: 0, False: 0]
Branch (266:24): [Folded - Ignored]
|
267 | 0 | let err = err.append("In CacheLookupScheduler::add_action"); |
268 | 0 | let maybe_pending_txs = { |
269 | 0 | let mut inflight_cache_checks = inflight_cache_checks.lock(); |
270 | 0 | // We are ready to resolve the in-flight actions. We remove the |
271 | 0 | // in-flight actions from the map. |
272 | 0 | inflight_cache_checks.remove(unique_key) |
273 | | }; |
274 | 0 | let Some(pending_txs) = maybe_pending_txs else { Branch (274:29): [True: 0, False: 0]
Branch (274:29): [Folded - Ignored]
|
275 | 0 | return; // Nobody is waiting for this action anymore. |
276 | | }; |
277 | 0 | for (_client_operation_id, pending_tx) in pending_txs { |
278 | 0 | // Ignore errors here, as the other end may have hung up. |
279 | 0 | let _ = pending_tx.send(Err(err.clone())); |
280 | 0 | } |
281 | 0 | return; |
282 | 0 | } |
283 | 0 | } |
284 | 0 | } |
285 | 0 |
|
286 | 0 | let maybe_pending_txs = { |
287 | 0 | let mut inflight_cache_checks = inflight_cache_checks.lock(); |
288 | 0 | inflight_cache_checks.remove(unique_key) |
289 | | }; |
290 | 0 | let Some(pending_txs) = maybe_pending_txs else { Branch (290:17): [True: 0, False: 0]
Branch (290:17): [Folded - Ignored]
|
291 | 0 | return; // Noone is waiting for this action anymore. |
292 | | }; |
293 | | |
294 | 0 | for (client_operation_id, pending_tx) in pending_txs { |
295 | | // Ignore errors here, as the other end may have hung up. |
296 | 0 | let _ = pending_tx.send( |
297 | 0 | action_scheduler |
298 | 0 | .add_action(client_operation_id, action_info.clone()) |
299 | 0 | .await, |
300 | | ); |
301 | | } |
302 | 0 | }); |
303 | 0 | action_listener_rx |
304 | 0 | .await |
305 | 0 | .map_err(|_| { |
306 | 0 | make_err!( |
307 | 0 | Code::Internal, |
308 | 0 | "ActionStateResult tx hung up in CacheLookupScheduler::add_action" |
309 | 0 | ) |
310 | 0 | })? |
311 | 0 | .err_tip(|| "In CacheLookupScheduler::add_action") |
312 | 1 | } |
313 | | |
314 | 1 | async fn inner_filter_operations( |
315 | 1 | &self, |
316 | 1 | filter: OperationFilter, |
317 | 1 | ) -> Result<ActionStateResultStream, Error> { |
318 | 1 | self.action_scheduler |
319 | 1 | .filter_operations(filter) |
320 | 1 | .await |
321 | 1 | .err_tip(|| "In CacheLookupScheduler::filter_operations"0 ) |
322 | 1 | } |
323 | | } |
324 | | |
325 | | #[async_trait] |
326 | | impl ClientStateManager for CacheLookupScheduler { |
327 | | async fn add_action( |
328 | | &self, |
329 | | client_operation_id: OperationId, |
330 | | action_info: Arc<ActionInfo>, |
331 | 1 | ) -> Result<Box<dyn ActionStateResult>, Error> { |
332 | 1 | self.inner_add_action(client_operation_id, action_info) |
333 | 1 | .await |
334 | 2 | } |
335 | | |
336 | | async fn filter_operations( |
337 | | &self, |
338 | | filter: OperationFilter, |
339 | 1 | ) -> Result<ActionStateResultStream, Error> { |
340 | 1 | self.inner_filter_operations(filter).await |
341 | 2 | } |
342 | | |
343 | 0 | fn as_known_platform_property_provider(&self) -> Option<&dyn KnownPlatformPropertyProvider> { |
344 | 0 | self.action_scheduler.as_known_platform_property_provider() |
345 | 0 | } |
346 | | } |
347 | | |
348 | | impl RootMetricsComponent for CacheLookupScheduler {} |