From 9fafee8208365db1d4c73756a1bef109d8805560 Mon Sep 17 00:00:00 2001
From: Edward Wang
Date: Mon, 11 Nov 2024 18:09:55 -0800
Subject: [PATCH] Release cache lock after proxy upstream filter, serve stale

A cache lock may still be held after serving stale, after the proxy
upstream filter declines a request, or after revalidating an
uncacheable response, resulting in dangling cache locks.

Also only disable the cache on final error if the cache was not
already disabled, and add the DeclinedToUpstream / UpstreamError
NoCacheReason variants.
---
 .bleep                           |  2 +-
 pingora-cache/src/lib.rs         | 70 +++++++++++++++++++++-----------
 pingora-cache/src/predictor.rs   |  2 +-
 pingora-proxy/src/lib.rs         | 17 +++++++-
 pingora-proxy/src/proxy_cache.rs | 17 +++++++-
 pingora-proxy/src/proxy_h2.rs    |  1 +
 6 files changed, 81 insertions(+), 28 deletions(-)

diff --git a/.bleep b/.bleep
index e78b1cb9..e4a53810 100644
--- a/.bleep
+++ b/.bleep
@@ -1 +1 @@
-4cba0f37f5682058c05904a3c5a395919a8ac498
\ No newline at end of file
+89319e9383d6f99066dfeace750a553d45e1c167
\ No newline at end of file

diff --git a/pingora-cache/src/lib.rs b/pingora-cache/src/lib.rs
index 6bebbd31..6ac49a59 100644
--- a/pingora-cache/src/lib.rs
+++ b/pingora-cache/src/lib.rs
@@ -125,8 +125,12 @@ pub enum NoCacheReason {
     ///
     /// This happens when the cache predictor predicted that this request is not cacheable, but
     /// the response turns out to be OK to cache. However, it might be too large to re-enable caching
     /// for this request.
     Deferred,
+    /// Due to the proxy upstream filter declining the current request from going upstream
+    DeclinedToUpstream,
+    /// Due to the upstream being unreachable or otherwise erroring during proxying
+    UpstreamError,
     /// The writer of the cache lock sees that the request is not cacheable (Could be OriginNotCache)
     CacheLockGiveUp,
     /// This request waited too long for the writer of the cache lock to finish, so this request will
@@ -147,6 +151,8 @@ impl NoCacheReason {
             StorageError => "StorageError",
             InternalError => "InternalError",
             Deferred => "Deferred",
+            DeclinedToUpstream => "DeclinedToUpstream",
+            UpstreamError => "UpstreamError",
             CacheLockGiveUp => "CacheLockGiveUp",
             CacheLockTimeout => "CacheLockTimeout",
             Custom(s) => s,
@@ -299,9 +305,44 @@ impl HttpCache {
             .is_some()
     }
 
+    /// Release the cache lock if the current request is a cache writer.
+    ///
+    /// Generally, callers should prefer `disable` when a cache lock should be released
+    /// due to an error, as it also clears all cache context. This function releases the
+    /// cache lock while keeping the cache around for reading, e.g. when serving stale.
+    pub fn release_write_lock(&mut self, reason: NoCacheReason) {
+        use NoCacheReason::*;
+        if let Some(inner) = self.inner.as_mut() {
+            let lock = inner.lock.take();
+            if let Some(Locked::Write(_r)) = lock {
+                let lock_status = match reason {
+                    // let the next request try to fetch it
+                    InternalError | StorageError | Deferred | UpstreamError => {
+                        LockStatus::TransientError
+                    }
+                    // depends on why the proxy upstream filter declined the request;
+                    // for now, still allow the next request to try to acquire, to avoid a thundering herd
+                    DeclinedToUpstream => LockStatus::TransientError,
+                    // no need for the lock anymore
+                    OriginNotCache | ResponseTooLarge => LockStatus::GiveUp,
+                    // not sure which LockStatus makes sense; treat it as GiveUp for now
+                    Custom(_) => LockStatus::GiveUp,
+                    // should never happen, NeverEnabled shouldn't hold a lock
+                    NeverEnabled => panic!("NeverEnabled holds a write lock"),
+                    CacheLockGiveUp | CacheLockTimeout => {
+                        panic!("CacheLock* are for cache lock readers only")
+                    }
+                };
+                inner
+                    .cache_lock
+                    .unwrap()
+                    .release(inner.key.as_ref().unwrap(), lock_status);
+            }
+        }
+    }
+
     /// Disable caching
     pub fn disable(&mut self, reason: NoCacheReason) {
-        use NoCacheReason::*;
         match self.phase {
             CachePhase::Disabled(_) => {
                 // replace reason
@@ -309,28 +350,7 @@
             }
             _ => {
                 self.phase = CachePhase::Disabled(reason);
-                if let Some(inner) = self.inner.as_mut() {
-                    let lock = inner.lock.take();
-                    if let Some(Locked::Write(_r)) = lock {
-                        let lock_status = match reason {
-                            // let the next request try to fetch it
-                            InternalError | StorageError | Deferred => LockStatus::TransientError,
-                            // no need for the lock anymore
-                            OriginNotCache | ResponseTooLarge => LockStatus::GiveUp,
-                            // not sure which LockStatus make sense, we treat it as GiveUp for now
-                            Custom(_) => LockStatus::GiveUp,
-                            // should never happen, NeverEnabled shouldn't hold a lock
-                            NeverEnabled => panic!("NeverEnabled holds a write lock"),
-                            CacheLockGiveUp | CacheLockTimeout => {
-                                panic!("CacheLock* are for cache lock readers only")
-                            }
-                        };
-                        inner
-                            .cache_lock
-                            .unwrap()
-                            .release(inner.key.as_ref().unwrap(), lock_status);
-                    }
-                }
+                self.release_write_lock(reason);
                 // log initial disable reason
                 self.inner_mut()
                     .traces
@@ -824,6 +844,8 @@ impl HttpCache {
             CachePhase::Stale => {
                 // replace cache meta header
                 self.inner_mut().meta.as_mut().unwrap().0.header = header;
+                // upstream request done, release write lock
+                self.release_write_lock(reason);
             }
             _ => panic!("wrong phase {:?}", self.phase),
         }

diff --git a/pingora-cache/src/predictor.rs b/pingora-cache/src/predictor.rs
index df8f3743..d131c20d 100644
--- a/pingora-cache/src/predictor.rs
+++ b/pingora-cache/src/predictor.rs
@@ -120,7 +120,7 @@ where
             // CacheLockGiveUp: the writer will set OriginNotCache (if applicable)
             // readers don't need to do it
             NeverEnabled | StorageError | InternalError | Deferred | CacheLockGiveUp
-            | CacheLockTimeout => {
+            | CacheLockTimeout | DeclinedToUpstream | UpstreamError => {
                 return None;
             }
             // Skip certain NoCacheReason::Custom according to user

diff --git a/pingora-proxy/src/lib.rs b/pingora-proxy/src/lib.rs
index f13a1f79..731cdb41 100644
--- a/pingora-proxy/src/lib.rs
+++ b/pingora-proxy/src/lib.rs
@@ -535,6 +535,10 @@ impl<SV> HttpProxy<SV> {
             if !proxy_to_upstream {
                 // The hook can choose to write its own response, but if it doesn't, we respond
                 // with a generic 502
+                if session.cache.enabled() {
+                    // drop the cache lock that this request may be holding onto
+                    session.cache.disable(NoCacheReason::DeclinedToUpstream);
+                }
                 if session.response_written().is_none() {
                    match session.write_response_header_ref(&BAD_GATEWAY).await {
                        Ok(()) => {}
@@ -557,6 +561,10 @@
                 /* else continue */
             }
             Err(e) => {
+                if session.cache.enabled() {
+                    session.cache.disable(NoCacheReason::InternalError);
+                }
+
                 self.handle_error(
                     &mut session,
                     &mut ctx,
@@ -621,7 +629,14 @@
         if let Some(e) = final_error.as_ref() {
             // If we have errored and are still holding a cache lock, release it.
-            session.cache.disable(NoCacheReason::InternalError);
+            if session.cache.enabled() {
+                let reason = if *e.esource() == ErrorSource::Upstream {
+                    NoCacheReason::UpstreamError
+                } else {
+                    NoCacheReason::InternalError
+                };
+                session.cache.disable(reason);
+            }
 
             let status = self.inner.fail_to_proxy(&mut session, e, &mut ctx).await;
 
             // final error will have > 0 status unless downstream connection is dead

diff --git a/pingora-proxy/src/proxy_cache.rs b/pingora-proxy/src/proxy_cache.rs
index 4369ec8b..8957bc76 100644
--- a/pingora-proxy/src/proxy_cache.rs
+++ b/pingora-proxy/src/proxy_cache.rs
@@ -667,8 +667,18 @@ impl<SV> HttpProxy<SV> {
                     None,
                     None,
                 );
-                self.inner
+                if self
+                    .inner
                     .should_serve_stale(session, ctx, Some(&http_status_error))
+                {
+                    // no more need to keep the write lock
+                    session
+                        .cache
+                        .release_write_lock(NoCacheReason::UpstreamError);
+                    true
+                } else {
+                    false
+                }
             } else {
                 false // not 304, not stale if error status code
             }
@@ -712,6 +722,11 @@ impl<SV> HttpProxy<SV> {
                 self.inner.request_summary(session, ctx)
             );
 
+            // no more need to hang onto the cache lock
+            session
+                .cache
+                .release_write_lock(NoCacheReason::UpstreamError);
+
             Some(self.proxy_cache_hit(session, ctx).await)
         }
 

diff --git a/pingora-proxy/src/proxy_h2.rs b/pingora-proxy/src/proxy_h2.rs
index ba832b24..a27c559b 100644
--- a/pingora-proxy/src/proxy_h2.rs
+++ b/pingora-proxy/src/proxy_h2.rs
@@ -409,6 +409,7 @@ impl<SV> HttpProxy<SV> {
             .cache_http_task(session, &task, ctx, serve_from_cache)
             .await
         {
+            session.cache.disable(NoCacheReason::StorageError);
             if serve_from_cache.is_miss_body() {
                 // if the response stream cache body during miss but write fails, it has to
                 // give up the entire request
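
Illustrative sketch (not part of the patch): the snippet below shows the application-side path this change fixes, assuming pingora's public `ProxyHttp` trait; the `Gate` service, the `/blocked` path policy, and the upstream address are hypothetical. When `proxy_upstream_filter` returns `Ok(false)` for a request that is the cache writer, the proxy now disables its cache with `NoCacheReason::DeclinedToUpstream`, which releases the write lock as `LockStatus::TransientError` so a waiting follower can retry the fetch instead of timing out on a dangling lock.

```rust
use async_trait::async_trait;
use pingora_core::upstreams::peer::HttpPeer;
use pingora_core::Result;
use pingora_proxy::{ProxyHttp, Session};

/// Hypothetical service, used only to illustrate the declined-to-upstream path.
pub struct Gate;

#[async_trait]
impl ProxyHttp for Gate {
    type CTX = ();
    fn new_ctx(&self) -> Self::CTX {}

    // Decline some requests from going upstream. If such a request holds the
    // cache write lock, this patch releases the lock via
    // NoCacheReason::DeclinedToUpstream instead of leaving it dangling.
    async fn proxy_upstream_filter(
        &self,
        session: &mut Session,
        _ctx: &mut Self::CTX,
    ) -> Result<bool> {
        // hypothetical policy: never proxy this path prefix upstream
        Ok(!session.req_header().uri.path().starts_with("/blocked"))
    }

    async fn upstream_peer(
        &self,
        _session: &mut Session,
        _ctx: &mut Self::CTX,
    ) -> Result<Box<HttpPeer>> {
        // hypothetical upstream address; plain TCP, no SNI
        Ok(Box::new(HttpPeer::new(("127.0.0.1", 8080), false, String::new())))
    }
}
```

The error paths work the same way: the final-error branch now picks `UpstreamError` vs. `InternalError` based on `e.esource()`, and both map to `LockStatus::TransientError`, so the next request may attempt the fetch rather than inheriting the writer's failure.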