Split the cache purge into chunks to stay under the Cloudflare purge limit

f4887c30 · Chris Coley · 2a528e61 · f4887c30 · f4887c30
Verified Commit f4887c30 authored Jun 16, 2024 by Chris Coley
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -71,8 +71,13 @@ pages:
  rules:
    - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
-# Purge all of this site's URLs from the Cloudflare cache
+# Create a list of URLs that need to be purged from the cache after GitLab Pages
-create-purge-json:
+# deployment. Each real file/directory has multiple URLs because the router for
+# GitLab Pages fudges paths for better user experience. The fudging rules are:
+#  - Directories can be accessed with or without a trailing slash
+#  - Files can be accessed with or without a trailing slash
+#  - Files with the '.html' extension can be accessed with or without .html
+create-purge-list:
  stage: deploy
  rules:
    - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
@@ -80,31 +85,28 @@ create-purge-json:
  tags:
    - docker
  script:
+    # Add the index-less homepage
+    - echo "$CI_PAGES_URL" > purge-list.txt
+    # Add HTML files and symlinks (dotfiles too), with and without .html
    - |-
-      echo -en "{\n  \"files\": [" > purge.json
+      for f in $(find public -mindepth 1 \( -type f -or -type l \) -iname '*.html'); do
-      # Add the index-less homepage, with and without trailing slash
+        f="${f#public/}"
-      echo -en "\n    \"$CI_PAGES_URL\"" >> purge.json
+        echo "$CI_PAGES_URL/$f" >> purge-list.txt
-      echo -en ",\n    \"$CI_PAGES_URL/\"" >> purge.json
+        echo "$CI_PAGES_URL/${f%.html}" >> purge-list.txt
-      # Add all the directories, non-HTML files and non-HTML symlinks; with and
-      # without trailing slash
-      for f in $(find public \( -type d -or -type f -or -type l \) -not -iname '*.html'); do
-        echo -en ",\n    \"$CI_PAGES_URL/${f#public/}\"" >> purge.json
-        echo -en ",\n    \"$CI_PAGES_URL/${f#public/}/\"" >> purge.json
      done
-      # Add all the HTML files and HTML symlinks, with and without trailing
+    # Add everything else; -mindepth 1 skips 'public' itself but keeps dotfiles
-      # slash. First with file extension, then without
+    - |-
-      for f in $(find public \( -type f -or -type l \) -iname '*.html'); do
+      for f in $(find public -mindepth 1 \( -type d -or -type f -or -type l \) -not -iname '*.html'); do
-        echo -en ",\n    \"$CI_PAGES_URL/${f#public/}\"" >> purge.json
+        echo "$CI_PAGES_URL/${f#public/}" >> purge-list.txt
-        echo -en ",\n    \"$CI_PAGES_URL/${f#public/}/\"" >> purge.json
-        f="${f%.html}"
-        echo -en ",\n    \"$CI_PAGES_URL/${f#public/}\"" >> purge.json
-        echo -en ",\n    \"$CI_PAGES_URL/${f#public/}/\"" >> purge.json
      done
-      echo -e "\n  ]\n}" >> purge.json
+    # Remove any duplicate URLs
-    - cat purge.json
+    - sort -u -o purge-list.txt purge-list.txt
+    # Duplicate each line, adding a trailing slash to the duplicates
+    - sed -i 'p;s|$|/|' purge-list.txt
+    - cat purge-list.txt
  artifacts:
    paths:
-      - purge.json
+      - purge-list.txt
 trigger-cache-purge:
  stage: .post


--- a/purge-cache.gitlab-ci.yml
+++ b/purge-cache.gitlab-ci.yml
-# Purge the Cloudflare cache using the request body contained in purge.json
+# Purge the URLs contained in purge-list.txt from the Cloudflare cache
 #
 # We delay this job to give the pages:deploy job time to finish. If we don't
 # delay, then the cache might refill with old pages before the new pages are
@@ -12,14 +12,37 @@ purge-cache:
      start_in: 3 minutes
  needs:
    - pipeline: $PARENT_PIPELINE_ID
-      job: create-purge-json
+      job: create-purge-list
+  before_script:
+    # Make sure the purge-list.txt file is readable, else exit
+    - test -r purge-list.txt || { echo "purge-list.txt not found" ; exit 1 ; }
+    - echo "Purge list has $(wc -l < purge-list.txt) URLs"
+    # Default to chunks of 30 URLs because Cloudflare only allows 30 URLs per
+    # purge request on free accounts
+    - echo "Chunk size of ${CF_PURGE_CACHE_CHUNK_SIZE:=30}"
  script:
-    - cat purge.json
+    # Split the purge list into chunks named 'purge-chunk-[aaa,aab,...]'
-    - >-
+    - split -l "$CF_PURGE_CACHE_CHUNK_SIZE" -a 3 purge-list.txt purge-chunk-
-      wget -qO- "https://api.cloudflare.com/client/v4/zones/$CF_PURGE_CACHE_ZONE/purge_cache"
+    # Loop over the chunks, creating a purge request for each
-      --header "Content-Type: application/json"
+    - |-
-      --header "Authorization: Bearer $CF_PURGE_CACHE_TOKEN"
+      for chunk in purge-chunk-* ; do
+        # Build the JSON request body; printf avoids non-portable 'echo -e'
+        printf '{\n  "files": [' > purge.json
+        comma='' # Reset so the first URL in each chunk gets no leading comma
+        while IFS= read -r path; do
+          printf '%s\n    "%s"' "$comma" "$path" >> purge.json
+          comma=','
+        done < "$chunk"
+        printf '\n  ]\n}\n' >> purge.json
+        cat purge.json
+        # Make the API request to Cloudflare to purge the URLs from cache
+        wget -qO- "https://api.cloudflare.com/client/v4/zones/$CF_PURGE_CACHE_ZONE/purge_cache" \
+          --header "Content-Type: application/json" \
+          --header "Authorization: Bearer $CF_PURGE_CACHE_TOKEN" \
          --post-file purge.json
+        # Rate limit ourselves to 1 request per second
+        sleep 1
+      done
 # vi: set ts=2 sw=2 et ft=yaml: