From c0e4910d1fe6ef9b6710d88b077b4df2c768f2c1 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Tue, 4 Nov 2025 13:55:31 -0500 Subject: [PATCH 01/19] =?UTF-8?q?pattern=5Frequirements=20for=20rules=20?= =?UTF-8?q?=E2=80=94=20Post-regex=20character-class=20gating=20to=20cut=20?= =?UTF-8?q?false=20positives=20without=20lookarounds.=20Authors=20can=20no?= =?UTF-8?q?w=20require=20minimum=20counts=20of=20digits,=20uppercase,=20lo?= =?UTF-8?q?wercase,=20and=20special=20characters,=20with=20an=20optional?= =?UTF-8?q?=20custom=20special-char=20set.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Why: Hyperscan doesn’t support lookaheads/behinds, so many “must contain X and Y” checks had to be baked into the regex (hurting readability) or were impossible. pattern_requirements applies lightweight, in-memory checks after a match is found, keeping patterns fast and clean. --- CHANGELOG.md | 2 + data/rules/adobe.yml | 4 +- data/rules/age.yml | 7 +- data/rules/ai21.yml | 4 + data/rules/airbrake.yml | 6 +- data/rules/airtable.yml | 7 +- data/rules/aiven.yml | 5 + data/rules/algolia.yml | 9 +- data/rules/alibaba.yml | 7 +- data/rules/anthropic.yml | 11 +- data/rules/anypoint.yml | 8 +- data/rules/apify.yml | 5 +- data/rules/artifactory.yml | 4 + data/rules/asana.yml | 14 +- data/rules/assemblyai.yml | 3 + data/rules/atlassian.yml | 5 +- data/rules/auth0.yml | 5 +- data/rules/authress.yml | 4 + data/rules/aws.yml | 10 +- data/rules/azure.yml | 2 + data/rules/azuredevops.yml | 2 + data/rules/azureopenai.yml | 3 + data/rules/azuresearchquery.yml | 4 + data/rules/azurestorage.yml | 16 +- data/rules/baremetrics.yml | 4 +- data/rules/baseten.yml | 4 + data/rules/beamer.yml | 7 +- data/rules/bitbucket.yml | 6 +- data/rules/bitly.yml | 4 +- data/rules/blynk.yml | 20 +++ data/rules/buildkite.yml | 4 +- data/rules/cerebras.yml | 4 +- data/rules/circleci.yml | 9 +- data/rules/ciscomeraki.yml | 3 +- data/rules/clarifai.yml | 2 + data/rules/clearbit.yml 
| 2 + data/rules/clickhouse.yml | 6 +- data/rules/clojars.yml | 6 +- data/rules/cloudflare.yml | 14 +- data/rules/cloudsight.yml | 6 +- data/rules/codacy.yml | 3 + data/rules/codeclimate.yml | 3 + data/rules/codecov.yml | 3 + data/rules/cohere.yml | 3 + data/rules/coinbase.yml | 4 +- data/rules/confluent.yml | 3 + data/rules/contentful.yml | 9 +- data/rules/coze.yml | 13 +- data/rules/crates.io.yml | 2 + data/rules/databricks.yml | 9 +- data/rules/datadog.yml | 8 +- data/rules/deepgram.yml | 4 +- data/rules/deepseek.yml | 4 +- data/rules/dependency_track.yml | 2 + data/rules/diffbot.yml | 2 + data/rules/digitalocean.yml | 4 +- data/rules/discord.yml | 7 +- data/rules/django.yml | 1 - data/rules/docker.yml | 2 + data/rules/dockerhub.yml | 2 + data/rules/doppler.yml | 6 +- data/rules/droneci.yml | 2 + data/rules/dropbox.yml | 5 +- data/rules/duffel.yml | 2 + data/rules/dynatrace.yml | 2 + data/rules/easypost.yml | 4 +- data/rules/elevenlabs.yml | 6 +- data/rules/facebook.yml | 15 +- data/rules/fastly.yml | 2 + data/rules/figma.yml | 6 +- data/rules/fileio.yml | 2 + data/rules/finicity.yml | 2 + data/rules/finnhub.yml | 2 + data/rules/firecrawl.yml | 2 + data/rules/fireworksai.yml | 2 + data/rules/flickr.yml | 6 +- data/rules/flyio.yml | 2 + data/rules/frame.io.yml | 6 +- data/rules/frameio.yml | 3 + data/rules/freshbooks.yml | 2 + data/rules/friendli.yml | 2 + data/rules/gcp.yml | 10 +- data/rules/generic.yml | 21 ++- data/rules/gitalk.yml | 2 + data/rules/github.yml | 14 +- data/rules/gitlab.yml | 16 +- data/rules/gitter.yml | 2 + data/rules/gocardless.yml | 6 +- data/rules/google.yml | 30 +++- data/rules/grafana.yml | 9 +- data/rules/groq.yml | 2 + data/rules/hashes.yml | 11 +- data/rules/hashicorp.yml | 30 ++-- data/rules/heroku.yml | 7 +- data/rules/hubspot.yml | 2 + data/rules/huggingface.yml | 3 +- data/rules/ibm.yml | 5 +- data/rules/infracost.yml | 2 + data/rules/intercom.yml | 3 + data/rules/ionic.yml | 6 +- data/rules/ipstack.yml | 2 + 
data/rules/jenkins.yml | 5 +- data/rules/jina.yml | 6 +- data/rules/jira.yml | 13 +- data/rules/jwt.yml | 5 +- data/rules/kagi.yml | 4 +- data/rules/kickbox.yml | 2 + data/rules/langchain.yml | 4 + data/rules/launchdarkly.yml | 2 + data/rules/line.yml | 2 + data/rules/linear.yml | 6 +- data/rules/linkedin.yml | 4 + data/rules/lob.yml | 4 + data/rules/mailchimp.yml | 3 + data/rules/mailgun.yml | 8 +- data/rules/mandrill.yml | 6 +- data/rules/mapbox.yml | 6 + data/rules/mattermost.yml | 2 + data/rules/maxmind.yml | 4 +- data/rules/messagebird.yml | 3 + data/rules/microsoft_teams.yml | 2 + data/rules/microsoftteamswebhook.yml | 28 ++-- data/rules/mistral.yml | 2 + data/rules/mongodb.yml | 9 +- data/rules/nasa.yml | 3 + data/rules/netlify.yml | 8 +- data/rules/newrelic.yml | 3 + data/rules/ngrok.yml | 1 - data/rules/notion.yml | 6 +- data/rules/npm.yml | 3 +- data/rules/nuget.yml | 4 +- data/rules/nvidia.yml | 2 + data/rules/nytimes.yml | 2 + data/rules/okta.yml | 4 +- data/rules/ollama.yml | 2 + data/rules/onepassword.yml | 9 +- data/rules/openai.yml | 4 + data/rules/openweathermap.yml | 3 +- data/rules/opsgenie.yml | 3 + data/rules/pagerdutyapikey.yml | 3 +- data/rules/particle.io.yml | 8 +- data/rules/pastebin.yml | 2 + data/rules/paypal.yml | 109 ++++++------- data/rules/pem.yml | 4 + data/rules/perplexity.yml | 2 + data/rules/planetscale.yml | 6 +- data/rules/postman.yml | 2 + data/rules/postmark.yml | 2 + data/rules/prefect.yml | 2 + data/rules/privkey.yml | 8 +- data/rules/psexec.yml | 1 - data/rules/pubnub.yml | 4 + data/rules/pulumi.yml | 2 + data/rules/pypi.yml | 2 + data/rules/rabbitmq.yml | 3 +- data/rules/react.yml | 7 - data/rules/readme.yml | 5 +- data/rules/recaptcha.yml | 3 + data/rules/replicate.yml | 6 +- data/rules/rubygems.yml | 6 +- data/rules/runway.yml | 4 +- data/rules/salesforce.yml | 4 +- data/rules/sauce.yml | 4 +- data/rules/scale.yml | 10 +- data/rules/scalingo.yml | 3 +- data/rules/segment.yml | 6 +- data/rules/sendbird.yml | 4 + 
data/rules/sendgrid.yml | 3 +- data/rules/sendinblue.yml | 2 + data/rules/sentry.yml | 7 + data/rules/shippo.yml | 2 + data/rules/shodan.yml | 2 + data/rules/shopify.yml | 2 + data/rules/slack.yml | 10 +- data/rules/snyk.yml | 4 +- data/rules/sonarcloud.yml | 2 + data/rules/sonarqube.yml | 5 +- data/rules/sourcegraph.yml | 8 +- data/rules/square.yml | 23 ++- data/rules/stabilityai.yml | 4 +- data/rules/stackhawk.yml | 2 + data/rules/stripe.yml | 12 +- data/rules/supabase.yml | 8 +- data/rules/tailscale.yml | 2 + data/rules/tavily.yml | 2 + data/rules/teamcity.yml | 2 + data/rules/telegram.yml | 2 + data/rules/thingsboard.yml | 20 ++- data/rules/togetherai.yml | 3 + data/rules/travisci.yml | 4 +- data/rules/truenas.yml | 4 + data/rules/twilio.yml | 8 +- data/rules/twitch.yml | 5 + data/rules/twitter.yml | 8 +- data/rules/typeform.yml | 2 + data/rules/vercel.yml | 4 + data/rules/vmware.yml | 1 - data/rules/weightsandbiases.yml | 4 +- data/rules/wireguard.yml | 2 + data/rules/xAI.yml | 4 +- data/rules/zhipu.yml | 2 + docs/RULES.md | 113 ++++++++++++-- src/baseline.rs | 1 + src/matcher.rs | 18 +++ src/reporter.rs | 1 + src/reporter/json_format.rs | 1 + src/rules/rule.rs | 223 +++++++++++++++++++++++++++ testdata/test_char_requirements.yaml | 24 +++ tests/fingerprint_dedup.rs | 1 + 209 files changed, 1267 insertions(+), 299 deletions(-) create mode 100644 testdata/test_char_requirements.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e23102..8555b69 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ All notable changes to this project will be documented in this file. +## [Unreleased] +- pattern_requirements for rules — Post-regex character-class gating to cut false positives without lookarounds. Authors can now require minimum counts of digits, uppercase, lowercase, and special characters, with an optional custom special-char set. Why? 
Hyperscan doesn’t support lookaheads/behinds, so many "must contain X and Y" checks had to be baked into the regex (hurting readability) or were impossible. `pattern_requirements` applies lightweight, in-memory checks after a match is found, keeping patterns fast and clean. ## [v1.61.0] - Fixed local filesystem scans to keep `open_path_as_is` enabled when opening Git repositories and only disable it for diff-based scans. - Created Linux and Windows specific installer script diff --git a/data/rules/adobe.yml b/data/rules/adobe.yml index 03b54d0..09ec783 100644 --- a/data/rules/adobe.yml +++ b/data/rules/adobe.yml @@ -6,11 +6,13 @@ rules: \b adobe (?:.|[\n\r]){0,32}? - \b + \b ( [A-F0-9]{32} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 examples: - adobeKey = 1a2b3c4d5e6f7890abcdef1234567890 diff --git a/data/rules/age.yml b/data/rules/age.yml index f296a26..e024a63 100644 --- a/data/rules/age.yml +++ b/data/rules/age.yml @@ -3,11 +3,14 @@ rules: id: kingfisher.age.1 pattern: | (?xi) - \b ( age1[0-9a-z]{58} ) \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.3 confidence: medium examples: @@ -21,11 +24,9 @@ rules: id: kingfisher.age.2 pattern: | (?xi) - \b ( AGE-SECRET-KEY-1[0-9A-Z]{58} ) - \b min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/ai21.yml b/data/rules/ai21.yml index 7a400eb..899e91c 100644 --- a/data/rules/ai21.yml +++ b/data/rules/ai21.yml @@ -19,6 +19,10 @@ rules: [0-9a-f]{12} ) \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.2 confidence: medium examples: diff --git a/data/rules/airbrake.yml b/data/rules/airbrake.yml index d54d261..8a25076 100644 --- a/data/rules/airbrake.yml +++ b/data/rules/airbrake.yml @@ -9,7 +9,11 @@ rules: ( [A-Z0-9-]{40} ) - \b + \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 4.5 confidence: medium examples: diff --git a/data/rules/airtable.yml 
b/data/rules/airtable.yml index 7b8b037..abb100d 100644 --- a/data/rules/airtable.yml +++ b/data/rules/airtable.yml @@ -10,7 +10,11 @@ rules: \. [a-z0-9]{62,66} ) - \b + \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.3 confidence: medium examples: @@ -37,7 +41,6 @@ rules: id: kingfisher.airtable.2 pattern: | (?xi) - \b ( [A-Z0-9]+\.v1\.[A-Z0-9_-]+\.[a-f0-9]+ ) diff --git a/data/rules/aiven.yml b/data/rules/aiven.yml index 09a7269..de6fdae 100644 --- a/data/rules/aiven.yml +++ b/data/rules/aiven.yml @@ -9,6 +9,11 @@ rules: ( [a-z0-9/+=]{372} ) + \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/algolia.yml b/data/rules/algolia.yml index dd65cf7..150cda9 100644 --- a/data/rules/algolia.yml +++ b/data/rules/algolia.yml @@ -5,11 +5,13 @@ rules: (?xi) algolia (?:.|[\n\r]){0,32}? - \b ( [a-z0-9]{32} ) \b + pattern_requirements: + min_digits: 2 + min_lowercase: 1 min_entropy: 3.5 confidence: medium examples: @@ -38,11 +40,12 @@ rules: (?xi) algolia (?:.|[\n\r]){0,16}? - \b ( [A-Z0-9]{10} ) - \b + \b + pattern_requirements: + min_digits: 2 min_entropy: 2.0 visible: false confidence: medium diff --git a/data/rules/alibaba.yml b/data/rules/alibaba.yml index b4807ec..990ab91 100644 --- a/data/rules/alibaba.yml +++ b/data/rules/alibaba.yml @@ -3,11 +3,14 @@ rules: id: kingfisher.alibabacloud.1 pattern: | (?xi) - \b ( LTAI[a-z0-9]{17,21} ) \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 4.0 confidence: medium visible: false @@ -21,11 +24,9 @@ rules: \b alibaba (?:.|[\n\r]){0,32}? 
- \b ( [a-z0-9]{30} ) - \b min_entropy: 4.2 confidence: medium examples: diff --git a/data/rules/anthropic.yml b/data/rules/anthropic.yml index 4f51c88..8907056 100644 --- a/data/rules/anthropic.yml +++ b/data/rules/anthropic.yml @@ -10,15 +10,16 @@ rules: - [\w\-]{93} AA - ) - \b + ) + \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.3 confidence: medium examples: - sk-ant-api668-Clm512odot9WDD7itfUU9R880nefA1EtYZDbpE-C9b0XQEWpqFKf9DQUo03vOfXl16oSmyar1CLF1SzV3YzpZJ6bahcpLAA - categories: - - api - - secret references: - https://docs.anthropic.com/claude/reference/authentication validation: diff --git a/data/rules/anypoint.yml b/data/rules/anypoint.yml index 4b3b8ab..d7d420b 100644 --- a/data/rules/anypoint.yml +++ b/data/rules/anypoint.yml @@ -18,8 +18,12 @@ rules: [0-9a-z]{4} - [0-9a-z]{12} - ) - \b + ) + \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/apify.yml b/data/rules/apify.yml index 78c99dd..1cf959c 100644 --- a/data/rules/apify.yml +++ b/data/rules/apify.yml @@ -3,11 +3,14 @@ rules: id: kingfisher.apify.1 pattern: | (?xi) - \b ( apify_api_[A-Z0-9]{34,38} ) \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 confidence: medium min_entropy: 3.5 validation: diff --git a/data/rules/artifactory.yml b/data/rules/artifactory.yml index f53174e..ff172e3 100644 --- a/data/rules/artifactory.yml +++ b/data/rules/artifactory.yml @@ -8,6 +8,10 @@ rules: AKC[A-Z0-9]{64,74} ) \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/asana.yml b/data/rules/asana.yml index 85e30c5..2d43c31 100644 --- a/data/rules/asana.yml +++ b/data/rules/asana.yml @@ -10,7 +10,9 @@ rules: ( [0-9]{16} ) - \b + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: @@ -30,7 +32,11 @@ 
rules: ( [a-z0-9]{30,40} ) - \b + \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.5 confidence: medium examples: @@ -54,7 +60,9 @@ rules: [a-z0-9]{32,64} )? ) - \b + \b + pattern_requirements: + min_digits: 4 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/assemblyai.yml b/data/rules/assemblyai.yml index 3c81111..38136d0 100644 --- a/data/rules/assemblyai.yml +++ b/data/rules/assemblyai.yml @@ -11,6 +11,9 @@ rules: [0-9a-z]{32} ) \b + pattern_requirements: + min_digits: 2 + min_lowercase: 1 min_entropy: 3.0 confidence: medium examples: diff --git a/data/rules/atlassian.yml b/data/rules/atlassian.yml index 32dccb2..5d198ee 100644 --- a/data/rules/atlassian.yml +++ b/data/rules/atlassian.yml @@ -10,7 +10,10 @@ rules: ( [a-z0-9]{24} ) - \b + \b + pattern_requirements: + min_digits: 2 + min_lowercase: 1 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/auth0.yml b/data/rules/auth0.yml index 39a0b43..f77497f 100644 --- a/data/rules/auth0.yml +++ b/data/rules/auth0.yml @@ -10,7 +10,10 @@ rules: ( [a-z0-9_-]{32,60} ) - \b + \b + pattern_requirements: + min_digits: 2 + min_lowercase: 1 min_entropy: 3.5 confidence: medium visible: false diff --git a/data/rules/authress.yml b/data/rules/authress.yml index 163d979..1df4922 100644 --- a/data/rules/authress.yml +++ b/data/rules/authress.yml @@ -6,6 +6,10 @@ rules: ( (?:sc|ext|scauth|authress)_[a-z0-9]{5,30}\.[a-z0-9]{4,6}\.acc[_-][a-z0-9-]{10,32}\.[a-z0-9+/_=-]{30,120} ) + \b + pattern_requirements: + min_digits: 2 + min_lowercase: 1 confidence: medium min_entropy: 4.0 validation: diff --git a/data/rules/aws.yml b/data/rules/aws.yml index f5fcf4a..6cd938a 100644 --- a/data/rules/aws.yml +++ b/data/rules/aws.yml @@ -3,12 +3,14 @@ rules: id: kingfisher.aws.1 pattern: | (?xi) - \b + \b ( (?:A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA) [2-7A-Z]{16} ) - \b + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.2 visible: false 
confidence: medium @@ -37,6 +39,8 @@ rules: ([A-Za-z0-9/+]{40}) \b ) + pattern_requirements: + min_digits: 2 min_entropy: 4.5 confidence: medium examples: @@ -60,6 +64,8 @@ rules: - name: AWS Session Token id: kingfisher.aws.4 pattern: '(?i)(?:aws.?session|aws.?session.?token|aws.?token)["''`]?\s{0,30}(?::|=>|=)\s{0,30}["''`]?([a-z0-9/+=]{16,200})[^a-z0-9/+=]' + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/azure.yml b/data/rules/azure.yml index 94a9902..1a1d650 100644 --- a/data/rules/azure.yml +++ b/data/rules/azure.yml @@ -90,6 +90,8 @@ rules: [A-Z0-9+/]{42}\+ACR[A-Z0-9]{6} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 4.0 validation: diff --git a/data/rules/azuredevops.yml b/data/rules/azuredevops.yml index a607bc9..8a21d80 100644 --- a/data/rules/azuredevops.yml +++ b/data/rules/azuredevops.yml @@ -24,6 +24,8 @@ rules: [a-z0-9]{75,76}AZDO[a-z0-9]{4,5} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3 confidence: medium examples: diff --git a/data/rules/azureopenai.yml b/data/rules/azureopenai.yml index 87e8127..51cba8c 100644 --- a/data/rules/azureopenai.yml +++ b/data/rules/azureopenai.yml @@ -14,6 +14,9 @@ rules: [a-f0-9]{32} ) \b + pattern_requirements: + min_digits: 2 + min_lowercase: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/azuresearchquery.yml b/data/rules/azuresearchquery.yml index 45b84b3..6e37d82 100644 --- a/data/rules/azuresearchquery.yml +++ b/data/rules/azuresearchquery.yml @@ -12,6 +12,10 @@ rules: [0-9A-Z]{52} ) \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/azurestorage.yml b/data/rules/azurestorage.yml index aea15a9..75698f4 100644 --- a/data/rules/azurestorage.yml +++ b/data/rules/azurestorage.yml @@ -33,13 +33,25 @@ rules: (?:.|[\n\r]){0,128}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,128}? 
+ ["':\s=}\]\)] ( - [A-Z0-9+\\/-]{86,88}={0,2} + (?: + [A-Z0-9+\-]{86,88}={1,2} + ) + | + (?: + [A-Z0-9+\-]{86,88}\b + ) ) + pattern_requirements: + min_digits: 2 + min_uppercase: 2 + min_lowercase: 2 min_entropy: 4.0 confidence: medium examples: - - Azure AccountKey=Xy9aB8cD7eF6gH5iJ4kL3mN2oP1qR0sT9uV8wX7yZ6aB5cD4eF3gH2iJ1kL0mN9oP8qR7sT6uV5wX4yZ3aB2cD1eF0gH9iJ8kL7mN6oP5q==\ + - Azure AccountKey=Xy9aB8cD7eF6gH5iJ4kL3mN2oP1qR0sT9uV8wX7yZ6aB5cD4eF3gH2iJ1kL0mN9oP8qR7sT6uV5wX4yZ3aB2cD1q + - Azure AccountKey=Xy9aB8cD7eF6gH5iJ4kL3mN2oP1qR0sT9uV8wX7yZ6aB5cD4eF3gH2iJ1kL0mN9oP8qR7sT6uV5wX4yZ3aB2cD1g==\ validation: type: AzureStorage depends_on_rule: diff --git a/data/rules/baremetrics.yml b/data/rules/baremetrics.yml index ce0f37a..2844cd3 100644 --- a/data/rules/baremetrics.yml +++ b/data/rules/baremetrics.yml @@ -10,7 +10,9 @@ rules: ( [a-z0-9_-]{25} ) - \b + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium references: diff --git a/data/rules/baseten.yml b/data/rules/baseten.yml index 8773d6c..b3ba12c 100644 --- a/data/rules/baseten.yml +++ b/data/rules/baseten.yml @@ -13,6 +13,10 @@ rules: [A-Za-z0-9]{32} ) \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.4 confidence: medium examples: diff --git a/data/rules/beamer.yml b/data/rules/beamer.yml index f051ff2..bbc2029 100644 --- a/data/rules/beamer.yml +++ b/data/rules/beamer.yml @@ -9,7 +9,12 @@ rules: \b ( b_[A-Z0-9=_\\/\\\-+]{44} - ) + ) + \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.0 confidence: medium examples: diff --git a/data/rules/bitbucket.yml b/data/rules/bitbucket.yml index ad7e74a..61370fa 100644 --- a/data/rules/bitbucket.yml +++ b/data/rules/bitbucket.yml @@ -8,9 +8,10 @@ rules: (?:.|[\n\r]){0,16}? (?:client|id) (?:.|[\n\r]){0,16}? 
- \b ([a-z0-9]{30,40}) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: @@ -43,6 +44,9 @@ rules: ( [a-z0-9+_\-+]{44} ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/bitly.yml b/data/rules/bitly.yml index 3f4ca02..df79eb3 100644 --- a/data/rules/bitly.yml +++ b/data/rules/bitly.yml @@ -8,11 +8,13 @@ rules: (?:.|[\n\r]){0,32}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,32}? - \b ( [a-f0-9]{40} ) \b + pattern_requirements: + min_digits: 2 + min_lowercase: 2 confidence: medium min_entropy: 3.0 validation: diff --git a/data/rules/blynk.yml b/data/rules/blynk.yml index 4af12c6..1f58e85 100644 --- a/data/rules/blynk.yml +++ b/data/rules/blynk.yml @@ -6,6 +6,10 @@ rules: https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)*blynk\.cloud/external/api/[A-Z0-9/]*\?token= ([A-Z0-9_\-]{32}) & + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.3 confidence: medium examples: @@ -21,6 +25,10 @@ rules: -H\s*"Authorization:\s*Bearer\s* ([A-Z0-9_\-]{40}) " + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.3 confidence: medium examples: @@ -35,6 +43,10 @@ rules: -H\s*"Authorization:\s*Bearer\s* ([A-Z0-9_\-]{40}) "[\s\\]*https://(?:fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)*blynk\.cloud/api + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.3 confidence: medium examples: @@ -50,6 +62,10 @@ rules: (oa2-client-id_[A-Z0-9_\-]{32}) (?: : | &client_secret= ) ([A-Z0-9_\-]{40}) + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.3 confidence: medium examples: @@ -66,6 +82,10 @@ rules: (oa2-client-id_[A-Z0-9_\-]{32}) :([A-Z0-9_\-]{40}) [\s\\]*https://(fra1\.|lon1\.|ny3\.|sgp1\.|blr1\.)*blynk\.cloud/oauth2 + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.3 confidence: medium examples: 
diff --git a/data/rules/buildkite.yml b/data/rules/buildkite.yml index 3728e98..045fa94 100644 --- a/data/rules/buildkite.yml +++ b/data/rules/buildkite.yml @@ -3,11 +3,13 @@ rules: id: kingfisher.buildkite.1 pattern: | (?xi) - \b ( bkua_[a-z0-9]{40} ) \b + pattern_requirements: + min_digits: 2 + min_lowercase: 1 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/cerebras.yml b/data/rules/cerebras.yml index 73760a6..044506e 100644 --- a/data/rules/cerebras.yml +++ b/data/rules/cerebras.yml @@ -3,11 +3,13 @@ rules: id: kingfisher.cerebras.1 pattern: | (?xi) - \b ( csk-[a-z0-9]{48} ) \b + pattern_requirements: + min_digits: 2 + min_lowercase: 2 confidence: medium min_entropy: 3.0 validation: diff --git a/data/rules/circleci.yml b/data/rules/circleci.yml index f3f2d2a..da20a80 100644 --- a/data/rules/circleci.yml +++ b/data/rules/circleci.yml @@ -20,7 +20,9 @@ rules: _ [a-z0-9]{40} ) - \b + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: @@ -56,7 +58,10 @@ rules: ( [a-f0-9]{40} ) - \b + \b + pattern_requirements: + min_digits: 2 + min_lowercase: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/ciscomeraki.yml b/data/rules/ciscomeraki.yml index a8cc4f1..03f9d3a 100644 --- a/data/rules/ciscomeraki.yml +++ b/data/rules/ciscomeraki.yml @@ -5,11 +5,12 @@ rules: (?xi) meraki (?:.|[\n\r]){0,32}? 
- \b ( [0-9a-f]{40} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/clarifai.yml b/data/rules/clarifai.yml index d72c9f2..0942f19 100644 --- a/data/rules/clarifai.yml +++ b/data/rules/clarifai.yml @@ -11,6 +11,8 @@ rules: [0-9a-f]{32,36} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.0 confidence: medium examples: diff --git a/data/rules/clearbit.yml b/data/rules/clearbit.yml index caa9e7a..02e4469 100644 --- a/data/rules/clearbit.yml +++ b/data/rules/clearbit.yml @@ -13,6 +13,8 @@ rules: [0-9a-z_]{35} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/clickhouse.yml b/data/rules/clickhouse.yml index 57317b9..9f22697 100644 --- a/data/rules/clickhouse.yml +++ b/data/rules/clickhouse.yml @@ -8,6 +8,8 @@ rules: 4b1d[a-z0-9]{38} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.5 examples: @@ -43,11 +45,11 @@ rules: (?:.|[\n\r]){0,16}? (?:ID|USER) (?:.|[\n\r]){0,16}? - \b ( [a-z0-9]{20} ) - \b + pattern_requirements: + min_digits: 2 confidence: medium visible: false min_entropy: 3.0 diff --git a/data/rules/clojars.yml b/data/rules/clojars.yml index 58fb19d..5dfaf53 100644 --- a/data/rules/clojars.yml +++ b/data/rules/clojars.yml @@ -10,9 +10,11 @@ rules: (?:.|[\n\r]){0,16}? 
\b ( - [a-zA-Z0-9_-]{3,} + [a-z0-9_-]{3,} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 1.5 visible: false @@ -28,6 +30,8 @@ rules: CLOJARS_[a-z0-9]{60} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 4.0 examples: diff --git a/data/rules/cloudflare.yml b/data/rules/cloudflare.yml index 776c1bd..aaf9d34 100644 --- a/data/rules/cloudflare.yml +++ b/data/rules/cloudflare.yml @@ -12,7 +12,11 @@ rules: ( [a-z0-9_-]{38,42} ) - \b + \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.5 confidence: medium examples: @@ -47,8 +51,9 @@ rules: ( v1\.0-[a-z0-9._-]{160,} ) - ["'`]? - \b + \b + pattern_requirements: + min_digits: 2 min_entropy: 4.5 confidence: medium examples: @@ -58,9 +63,6 @@ rules: references: - https://developers.cloudflare.com/api/keys/ - https://developers.cloudflare.com/fundamentals/api/get-started/keys/ - categories: - - api - - secret validation: type: Http content: diff --git a/data/rules/cloudsight.yml b/data/rules/cloudsight.yml index b06e922..f3bd7f6 100644 --- a/data/rules/cloudsight.yml +++ b/data/rules/cloudsight.yml @@ -11,8 +11,10 @@ rules: \b ( [a-z0-9]{20,24} - ) - \b + ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/codacy.yml b/data/rules/codacy.yml index 121dbf4..e0a68d5 100644 --- a/data/rules/codacy.yml +++ b/data/rules/codacy.yml @@ -12,6 +12,9 @@ rules: ( [0-9A-Z]{20,24} ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/codeclimate.yml b/data/rules/codeclimate.yml index 677cbc5..aeaa805 100644 --- a/data/rules/codeclimate.yml +++ b/data/rules/codeclimate.yml @@ -5,10 +5,13 @@ rules: (?xi) (?: CODECLIMATE| CC_TEST_REPORTER_ID) (?:.|[\n\r]){0,64}? 
+ \b ( [a-f0-9]{64} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/codecov.yml b/data/rules/codecov.yml index 1bc5e7a..a2ab1e7 100644 --- a/data/rules/codecov.yml +++ b/data/rules/codecov.yml @@ -8,10 +8,13 @@ rules: (?:.|[\n\r]){0,32}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,32}? + \b ( [A-Z0-9-]{36} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.5 validation: diff --git a/data/rules/cohere.yml b/data/rules/cohere.yml index a362570..f8136f4 100644 --- a/data/rules/cohere.yml +++ b/data/rules/cohere.yml @@ -6,10 +6,13 @@ rules: \b cohere (?:.|[\n\r]){0,16}? + \b ( [A-Z0-9]{40} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/coinbase.yml b/data/rules/coinbase.yml index c5a5763..159dc90 100644 --- a/data/rules/coinbase.yml +++ b/data/rules/coinbase.yml @@ -10,9 +10,11 @@ rules: (?:.|[\n\r]){0,16}? \b ( - [a-zA-Z-0-9]{32} + [a-z-0-9]{32} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 examples: - coinbase_token = 32iAkQCcHHYxXGx20VogBZoj27PC1ouI diff --git a/data/rules/confluent.yml b/data/rules/confluent.yml index 7deda55..96fefe2 100644 --- a/data/rules/confluent.yml +++ b/data/rules/confluent.yml @@ -10,6 +10,8 @@ rules: [A-Z0-9]{16} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3 confidence: medium visible: false @@ -30,6 +32,7 @@ rules: ( [A-Z0-9\+/]{64} ) + \b min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/contentful.yml b/data/rules/contentful.yml index 3846abf..3c91b77 100644 --- a/data/rules/contentful.yml +++ b/data/rules/contentful.yml @@ -10,9 +10,12 @@ rules: (?:.|[\n\r]){0,32}? 
\b ( - [a-zA-Z0-9_-]{43,45} + [A-Z0-9_-]{43,45} ) \b + pattern_requirements: + min_digits: 2 + min_special_chars: 1 confidence: medium min_entropy: 4.0 validation: @@ -42,11 +45,11 @@ rules: id: kingfisher.contentful.2 pattern: | (?xi) - \b ( CFPAT-[A-Z0-9_-]{43} ) - \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.5 validation: diff --git a/data/rules/coze.yml b/data/rules/coze.yml index df3692a..982578b 100644 --- a/data/rules/coze.yml +++ b/data/rules/coze.yml @@ -1,7 +1,18 @@ rules: - name: Coze Personal Access Token id: kingfisher.coze.1 - pattern: '(?i)\b(pat_[a-zA-Z0-9]{64})\b' + pattern: | + (?xi) + \b + coze + (?:.|[\n\r]){0,32}? + \b + ( + pat_[A-Z0-9]{64} + ) + \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 5.0 validation: diff --git a/data/rules/crates.io.yml b/data/rules/crates.io.yml index 7f6b8f2..4aa7ef6 100644 --- a/data/rules/crates.io.yml +++ b/data/rules/crates.io.yml @@ -8,6 +8,8 @@ rules: cio[A-Z0-9]{32} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/databricks.yml b/data/rules/databricks.yml index 9cec5c3..294cb20 100644 --- a/data/rules/databricks.yml +++ b/data/rules/databricks.yml @@ -2,12 +2,17 @@ rules: - name: Databricks API token id: kingfisher.databricks.1 pattern: | - (?xi) + (?xi) + \b ( dapi [a-f0-9]{32} ) - \b + \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/datadog.yml b/data/rules/datadog.yml index 79ff171..6958bf8 100644 --- a/data/rules/datadog.yml +++ b/data/rules/datadog.yml @@ -13,6 +13,8 @@ rules: [a-z0-9]{32} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: @@ -47,11 +49,13 @@ rules: (?:.|[\n\r]){0,64}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,16}? 
- \b ( [a-z0-9]{40} ) - \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/deepgram.yml b/data/rules/deepgram.yml index 850afa9..5a474f6 100644 --- a/data/rules/deepgram.yml +++ b/data/rules/deepgram.yml @@ -6,11 +6,13 @@ rules: \b deepgram (?:.|[\n\r]){0,32}? - \b + \b ( [0-9a-f]{40} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/deepseek.yml b/data/rules/deepseek.yml index f303c48..233ae52 100644 --- a/data/rules/deepseek.yml +++ b/data/rules/deepseek.yml @@ -6,8 +6,10 @@ rules: \b ( sk-[a-f0-9]{32} - ) + ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.7 confidence: medium examples: diff --git a/data/rules/dependency_track.yml b/data/rules/dependency_track.yml index e41aa32..10f397c 100644 --- a/data/rules/dependency_track.yml +++ b/data/rules/dependency_track.yml @@ -8,6 +8,8 @@ rules: odt_[A-Z0-9]{32,255} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/diffbot.yml b/data/rules/diffbot.yml index 231109b..01c4114 100644 --- a/data/rules/diffbot.yml +++ b/data/rules/diffbot.yml @@ -11,6 +11,8 @@ rules: [0-9a-z]{32} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.0 examples: - diffbot_key = a7424adbafc4624e61482d0f60e43016 diff --git a/data/rules/digitalocean.yml b/data/rules/digitalocean.yml index b02bb52..b6ca932 100644 --- a/data/rules/digitalocean.yml +++ b/data/rules/digitalocean.yml @@ -9,6 +9,8 @@ rules: [a-f0-9]{64} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: @@ -33,12 +35,10 @@ rules: id: kingfisher.digitalocean.2 pattern: | (?x) - \b ( dor_v1_ [a-f0-9]{64} ) - \b min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/discord.yml b/data/rules/discord.yml index 6b8809b..f350c29 100644 --- a/data/rules/discord.yml +++ b/data/rules/discord.yml 
@@ -10,6 +10,9 @@ rules: ( [0-9a-z_\-]{68} ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: @@ -32,7 +35,6 @@ rules: id: kingfisher.discord.2 pattern: | (?xi) - \b ( [MNO][A-Z0-9_-]{23}\.[A-Z0-9_-]{6}\.[A-Z0-9_-]{27} ) @@ -61,14 +63,11 @@ rules: id: kingfisher.discord.3 pattern: | (?xi) - \b (?:discord|botid|bot_id) (?:.|[\n\r]){0,64}? - \b ( \d{17,19} ) - \b min_entropy: 3.5 visible: false confidence: medium diff --git a/data/rules/django.yml b/data/rules/django.yml index c335542..c9efa82 100644 --- a/data/rules/django.yml +++ b/data/rules/django.yml @@ -5,7 +5,6 @@ rules: (?x) [DJANGO]\w{0,8}SECRET_KEY .{1,16}? - \b ( [A-Za-z0-9*!$@\#&_%^-]{45,55} ) diff --git a/data/rules/docker.yml b/data/rules/docker.yml index 1d37ab8..1503cd4 100644 --- a/data/rules/docker.yml +++ b/data/rules/docker.yml @@ -12,6 +12,8 @@ rules: \} [^}]*? \} + pattern_requirements: + min_digits: 2 min_entropy: 2.0 confidence: medium examples: diff --git a/data/rules/dockerhub.yml b/data/rules/dockerhub.yml index e26d108..45c740b 100644 --- a/data/rules/dockerhub.yml +++ b/data/rules/dockerhub.yml @@ -8,6 +8,8 @@ rules: dckr_pat_[A-Z0-9_-]{27} ) (?: $ | [^A-Z0-9_-] ) + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/doppler.yml b/data/rules/doppler.yml index c8c302a..fde9282 100644 --- a/data/rules/doppler.yml +++ b/data/rules/doppler.yml @@ -4,8 +4,12 @@ rules: pattern: | (?xi) \b - (dp\.ct\.[A-Z0-9]{40,44}) + ( + dp\.ct\.[A-Z0-9]{40,44} + ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/droneci.yml b/data/rules/droneci.yml index 8e26b66..e6a4bd1 100644 --- a/data/rules/droneci.yml +++ b/data/rules/droneci.yml @@ -15,6 +15,8 @@ rules: [a-f0-9]{32,64} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/dropbox.yml b/data/rules/dropbox.yml index d7d4766..d21eb80 
100644 --- a/data/rules/dropbox.yml +++ b/data/rules/dropbox.yml @@ -4,11 +4,12 @@ rules: pattern: | (?xi) \b - sl\. ( - [A-Z0-9\-\_]{130,152} + sl\.[A-Z0-9\-\_]{130,152} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/duffel.yml b/data/rules/duffel.yml index 9b52965..da0a928 100644 --- a/data/rules/duffel.yml +++ b/data/rules/duffel.yml @@ -8,6 +8,8 @@ rules: duffel_(?:test|live)_[a-z0-9_\-=]{43} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.2 confidence: medium examples: diff --git a/data/rules/dynatrace.yml b/data/rules/dynatrace.yml index af8a415..29226f4 100644 --- a/data/rules/dynatrace.yml +++ b/data/rules/dynatrace.yml @@ -12,6 +12,8 @@ rules: [A-Z0-9]{64} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/easypost.yml b/data/rules/easypost.yml index 757a70a..d5f1a42 100644 --- a/data/rules/easypost.yml +++ b/data/rules/easypost.yml @@ -6,9 +6,11 @@ rules: \b ( EZ[AT]K - [A-Za-z0-9]{54} + [A-Z0-9]{54} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/elevenlabs.yml b/data/rules/elevenlabs.yml index ade342c..9378a34 100644 --- a/data/rules/elevenlabs.yml +++ b/data/rules/elevenlabs.yml @@ -2,13 +2,15 @@ rules: - name: ElevenLabs API Key id: kingfisher.elevenlabs.1 pattern: | - (?xi) - \b + (?xi) + \b ( sk_ [0-9a-f]{48} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/facebook.yml b/data/rules/facebook.yml index d3ef582..841cda9 100644 --- a/data/rules/facebook.yml +++ b/data/rules/facebook.yml @@ -11,8 +11,10 @@ rules: \b ( \d{15} - ) - \b + ) + \b + pattern_requirements: + min_digits: 15 min_entropy: 2.0 visible: false confidence: medium @@ -37,6 +39,8 @@ rules: - " var fbApiKey = '0278fc1adf6dc1d82a156f306ce2c5cc';" - ' fbApiKey: "171e84fd57f430fc59afa8fad3dbda2a",' - '"facebook 
appSecret = "ce3f9f0362bbe5ab01dfc8ee565e4372"' + pattern_requirements: + min_digits: 2 validation: type: Http content: @@ -69,9 +73,12 @@ rules: (?:access_token|access[\s-]token) (?:.|[\n\r]){0,32}? )? + ( + EAACEdEose0cBA[A-Z0-9]{20,} + ) \b - (EAACEdEose0cBA[A-Z0-9]{20,}) - \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/fastly.yml b/data/rules/fastly.yml index 77618a6..07cadce 100644 --- a/data/rules/fastly.yml +++ b/data/rules/fastly.yml @@ -13,6 +13,8 @@ rules: [a-z0-9_-]{32} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/figma.yml b/data/rules/figma.yml index 7cc6f00..b5faf90 100644 --- a/data/rules/figma.yml +++ b/data/rules/figma.yml @@ -8,6 +8,8 @@ rules: figd_[A-Z0-9_-]{38,42} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: @@ -36,14 +38,14 @@ rules: (?xi) figma (?:.|[\n\r]){0,32}? - \b ( [0-9A-F]{4} -[0-9A-F]{8} (?:-[0-9A-F]{4}){3} -[0-9A-F]{12} ) - \b + pattern_requirements: + min_digits: 2 examples: - "--header='X-Figma-Token: 1394-0ca7a5be-8e22-40ee-8c40-778d41ab2313'" references: diff --git a/data/rules/fileio.yml b/data/rules/fileio.yml index 7bf255e..804fcd4 100644 --- a/data/rules/fileio.yml +++ b/data/rules/fileio.yml @@ -14,6 +14,8 @@ rules: \.[A-Z0-9]{20} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/finicity.yml b/data/rules/finicity.yml index 5273fbf..4a29310 100644 --- a/data/rules/finicity.yml +++ b/data/rules/finicity.yml @@ -13,6 +13,8 @@ rules: [a-f0-9]{32} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/finnhub.yml b/data/rules/finnhub.yml index 01e8c0a..3d71fb6 100644 --- a/data/rules/finnhub.yml +++ b/data/rules/finnhub.yml @@ -11,6 +11,8 @@ rules: [a-z0-9]{20} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.0 
confidence: medium examples: diff --git a/data/rules/firecrawl.yml b/data/rules/firecrawl.yml index fc729c6..80665b3 100644 --- a/data/rules/firecrawl.yml +++ b/data/rules/firecrawl.yml @@ -8,6 +8,8 @@ rules: fc-[a-f0-9]{32} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.0 validation: diff --git a/data/rules/fireworksai.yml b/data/rules/fireworksai.yml index 0933441..f96d251 100644 --- a/data/rules/fireworksai.yml +++ b/data/rules/fireworksai.yml @@ -8,6 +8,8 @@ rules: fw_[A-Z0-9]{24} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.5 validation: diff --git a/data/rules/flickr.yml b/data/rules/flickr.yml index d789348..fb104eb 100644 --- a/data/rules/flickr.yml +++ b/data/rules/flickr.yml @@ -13,6 +13,8 @@ rules: [a-f0-9]{32} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.0 validation: @@ -43,11 +45,11 @@ rules: (?:.|[\n\r]){0,32}? (?:OAUTH|ACCESS|TOKEN)? (?:.|[\n\r]){0,32}? - \b ( [a-f0-9]{32} ) - \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.0 validation: diff --git a/data/rules/flyio.yml b/data/rules/flyio.yml index 98b3a6a..4764279 100644 --- a/data/rules/flyio.yml +++ b/data/rules/flyio.yml @@ -8,6 +8,8 @@ rules: FlyV1\s[A-Za-z0-9=_\-,/+]{100,} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 4.0 validation: diff --git a/data/rules/frame.io.yml b/data/rules/frame.io.yml index a5ef94b..035aefc 100644 --- a/data/rules/frame.io.yml +++ b/data/rules/frame.io.yml @@ -7,7 +7,11 @@ rules: ( fio-u-(?:[A-Z0-9_-]{16}){4} ) - \b + \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/frameio.yml b/data/rules/frameio.yml index 1910965..cfbfe10 100644 --- a/data/rules/frameio.yml +++ b/data/rules/frameio.yml @@ -7,6 +7,9 @@ rules: ( fio-u-[a-z0-9\-_=]{64} ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 
confidence: medium examples: diff --git a/data/rules/freshbooks.yml b/data/rules/freshbooks.yml index 95fadce..d63c14a 100644 --- a/data/rules/freshbooks.yml +++ b/data/rules/freshbooks.yml @@ -11,6 +11,8 @@ rules: [a-z0-9]{64} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/friendli.yml b/data/rules/friendli.yml index ec5f3ec..6881ffc 100644 --- a/data/rules/friendli.yml +++ b/data/rules/friendli.yml @@ -8,6 +8,8 @@ rules: flp_[A-Z0-9]{46} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.0 validation: diff --git a/data/rules/gcp.yml b/data/rules/gcp.yml index e23acb4..30d4d1d 100644 --- a/data/rules/gcp.yml +++ b/data/rules/gcp.yml @@ -15,6 +15,8 @@ rules: "auth_provider_x509_cert_url":\s*".+?" (?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})* \} + pattern_requirements: + min_digits: 2 min_entropy: 4.5 confidence: high examples: @@ -48,9 +50,13 @@ rules: [=:] \s{0,8} ["']? - ([0-9a-z]{35,40}) + ( + [0-9a-z]{35,40} + ) ["']? - \b + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/generic.yml b/data/rules/generic.yml index be42e5c..38308f4 100644 --- a/data/rules/generic.yml +++ b/data/rules/generic.yml @@ -5,9 +5,12 @@ rules: (?xi) secret .{0,20} + ( + [0-9a-z]{32,64} + ) \b - ([0-9a-z]{32,64}) - \b + pattern_requirements: + min_digits: 4 min_entropy: 3.3 confidence: low examples: @@ -18,9 +21,12 @@ rules: (?xi) (?: api_key | apikey | access_key | accesskey ) (?:.|[\n\r]){0,8}? + ( + [0-9a-z][0-9a-z\-._/+]{30,62}[0-9a-z] + ) \b - ([0-9a-z][0-9a-z\-._/+]{30,62}[0-9a-z]) - \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: low examples: @@ -31,7 +37,6 @@ rules: pattern: | (?xi) (?: username | user) - \b (?:.|[\n\r]){0,16}? (?: password | pass ) (?:.|[\n\r]){0,16}? @@ -66,7 +71,6 @@ rules: pattern: | (?xi) (?: username | user) - \b (?:.|[\n\r]){0,16}? (?: password | pass ) (?:.|[\n\r]){0,16}? 
@@ -89,7 +93,6 @@ rules: pattern: | (?xi) password - \b (?:.|[\n\r]){0,16}? ["'] ([^$<%@.,\s'"(){}&/\#\-][^\s'"(){}/]{4,}) (?# password ) @@ -130,7 +133,6 @@ rules: id: kingfisher.generic.6 pattern: | (?xi) - \b ( blink\d{3,6} |correcthorsebatterystaple\d{0,6} @@ -144,7 +146,6 @@ rules: |qwerty\d{3,6} |trustno\d{1,6} ) - \b min_entropy: 1.0 confidence: low examples: @@ -158,7 +159,6 @@ rules: pattern: | (?xi) (?: db_user | db_USERNAME | db_name) - \b (?:.|[\n\r]){0,8}? ["'] ([^"']{5,40}) ["'] (?:.|[\n\r]){0,32}? @@ -197,7 +197,6 @@ rules: id: kingfisher.generic.9 pattern: | (?xi) - \b ( (?P [a-z0-9._-]+ \+ [a-z0-9._-]+ ) : diff --git a/data/rules/gitalk.yml b/data/rules/gitalk.yml index 94574a4..d052855 100644 --- a/data/rules/gitalk.yml +++ b/data/rules/gitalk.yml @@ -7,6 +7,8 @@ rules: new \s+ Gitalk \s* \( \s* \{ \s* clientID: \s* '([a-f0-9]{20})', \s* clientSecret: \s* '([a-f0-9]{40})', + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.0 references: diff --git a/data/rules/github.yml b/data/rules/github.yml index 971f10d..88f9a56 100644 --- a/data/rules/github.yml +++ b/data/rules/github.yml @@ -18,6 +18,9 @@ rules: ) ) \b + pattern_requirements: + min_digits: 2 + min_lowercase: 2 min_entropy: 3.5 examples: - "GITHUB_KEY=ghp_XIxB7KMNdAr3zqWtQqhE94qglHqOzn1D1stg" @@ -57,8 +60,9 @@ rules: ( gho_ [A-Z0-9]{36} - ) - \b + ) + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: @@ -90,11 +94,9 @@ rules: id: kingfisher.github.3 pattern: | (?xi) - \b ( (?:ghu|ghs)_[A-Z0-9]{36} ) - \b examples: - ' "token": "ghu_16C7e42F292c69C2E7C10c838347Ae178B4a",' - | @@ -126,11 +128,9 @@ rules: id: kingfisher.github.4 pattern: | (?xi) - \b ( ghr_[A-Z0-9]{76} ) - \b examples: - ' "refresh_token": "ghr_1B4a2e77838347a7E420ce178F2E7c6912E169246c3CE1ccbF66C46812d16D5B1A9Dc86A1498",' references: @@ -210,11 +210,9 @@ rules: id: kingfisher.github.7 pattern: | (?xi) - \b ( github_pat_[0-9A-Z_]{82} ) - \b examples: - 
'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' validation: diff --git a/data/rules/gitlab.yml b/data/rules/gitlab.yml index 1cdf48c..08ea0ea 100644 --- a/data/rules/gitlab.yml +++ b/data/rules/gitlab.yml @@ -2,12 +2,15 @@ rules: - name: GitLab Private Token id: kingfisher.gitlab.1 pattern: | - (?xi) - \b + (?xi) + \b ( glpat- [0-9A-Z_-]{20} ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: @@ -42,6 +45,8 @@ rules: GR1348941[0-9A-Z_-]{20} ) \b + pattern_requirements: + min_digits: 2 examples: - | sudo gitlab-runner register \ @@ -86,6 +91,8 @@ rules: glptt-[0-9a-f]{40} ) \b + pattern_requirements: + min_digits: 2 examples: - | curl \ @@ -117,11 +124,14 @@ rules: - name: GitLab Private Token - Updated Format id: kingfisher.gitlab.4 pattern: | - (?x) + (?x) \b ( glpat-[A-Za-z0-9_-]{36,38}\.01\.[a-z0-9]{9} ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/gitter.yml b/data/rules/gitter.yml index e2b261c..3da37a3 100644 --- a/data/rules/gitter.yml +++ b/data/rules/gitter.yml @@ -11,6 +11,8 @@ rules: [a-z0-9_-]{40} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.2 confidence: medium examples: diff --git a/data/rules/gocardless.yml b/data/rules/gocardless.yml index e261372..525b20b 100644 --- a/data/rules/gocardless.yml +++ b/data/rules/gocardless.yml @@ -13,7 +13,11 @@ rules: (?:[A-Z0-9=_-]{8}){3} [A-Z0-9=_-]{0,2} ) - \b + \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/google.yml b/data/rules/google.yml index d7d4aed..c338245 100644 --- a/data/rules/google.yml +++ b/data/rules/google.yml @@ -15,9 +15,12 @@ rules: id: kingfisher.google.2 pattern: | (?xi) - \b (GOCSPX-[A-Z0-9_-]{28}) (?:[^A-Z0-9_-] | $) + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.3 
confidence: medium examples: @@ -28,9 +31,14 @@ rules: pattern: | (?xi) client.?secret .{0,10} - \b - ([a-z0-9_-]{24}) + ( + [a-z0-9_-]{24} + ) (?: [^a-z0-9_-] |$) + pattern_requirements: + min_digits: 4 + min_uppercase: 3 + min_lowercase: 3 min_entropy: 3.3 confidence: medium examples: @@ -42,9 +50,12 @@ rules: id: kingfisher.google.4 pattern: | (?xi) - \b (ya29\.[0-9A-Z_-]{20,1024}) (?: [^0-9A-Z_-]|$) + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.3 confidence: medium examples: @@ -66,12 +77,10 @@ rules: id: kingfisher.google.6 pattern: | (?xi) - \b ([0-9]+-[a-z0-9_]{32}\.apps\.googleusercontent\.com) (?: (?s).{0,40} ) - \b (?: (GOCSPX-[A-Z0-9_-]{28}) | @@ -80,6 +89,10 @@ rules: ) ) (?:[^A-Z0-9_-] | $) + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.3 confidence: medium examples: @@ -98,11 +111,16 @@ rules: id: kingfisher.google.7 pattern: | (?xi) + \b ( AIza [A-Za-z0-9_-]{35} ) \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/grafana.yml b/data/rules/grafana.yml index 4cb5461..a7c9726 100644 --- a/data/rules/grafana.yml +++ b/data/rules/grafana.yml @@ -8,6 +8,8 @@ rules: eyJrIjoi[a-z0-9]{60,100} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: @@ -20,12 +22,15 @@ rules: id: kingfisher.grafana.2 pattern: | (?xi) - \b + \b ( glc_ [a-z0-9+/]{40,150} ={0,2} ) + pattern_requirements: + min_digits: 2 + min_lowercase: 2 min_entropy: 3.3 confidence: medium examples: @@ -56,6 +61,8 @@ rules: glsa_[A-Z0-9]{32}_[A-F0-9]{8} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/groq.yml b/data/rules/groq.yml index b232ddd..deab409 100644 --- a/data/rules/groq.yml +++ b/data/rules/groq.yml @@ -8,6 +8,8 @@ rules: gsk_[a-zA-Z0-9]{52} ) \b + pattern_requirements: + min_digits: 2 
confidence: medium min_entropy: 4.5 validation: diff --git a/data/rules/hashes.yml b/data/rules/hashes.yml index 60b0f4a..bf6f5af 100644 --- a/data/rules/hashes.yml +++ b/data/rules/hashes.yml @@ -7,6 +7,8 @@ rules: - https://unix.stackexchange.com/a/511017 - https://hashcat.net/wiki/doku.php?id=example_hashes - https://passwordvillage.org/salted.html#md5crypt + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: # generated with `openssl passwd -1 -salt 'OKgLCmVl' 'a'` @@ -38,6 +40,8 @@ rules: \$ [./A-Za-z0-9]{8,16} \$ [./A-Za-z0-9]{43} ) + pattern_requirements: + min_digits: 2 references: - https://en.wikipedia.org/wiki/Crypt_(C)#Key_derivation_functions_supported_by_crypt - https://hashcat.net/wiki/doku.php?id=example_hashes @@ -58,6 +62,8 @@ rules: \$ [./A-Za-z0-9]{8,16} \$ [./A-Za-z0-9]{86} ) + pattern_requirements: + min_digits: 2 references: - https://en.wikipedia.org/wiki/Crypt_(C)#Key_derivation_functions_supported_by_crypt - https://hashcat.net/wiki/doku.php?id=example_hashes @@ -76,6 +82,8 @@ rules: \$ [./A-Za-z0-9]{8,16} \$ [./A-Za-z0-9]{43} ) + pattern_requirements: + min_digits: 2 references: - https://en.wikipedia.org/wiki/Crypt_(C)#Key_derivation_functions_supported_by_crypt - https://hashcat.net/wiki/doku.php?id=example_hashes @@ -96,7 +104,8 @@ rules: [0-9a-f]{32} \$ [0-9a-f]{64,} ) - \b + pattern_requirements: + min_digits: 2 references: - https://hashcat.net/wiki/doku.php?id=example_hashes min_entropy: 3.3 diff --git a/data/rules/hashicorp.yml b/data/rules/hashicorp.yml index 20adfc1..b6a0de1 100644 --- a/data/rules/hashicorp.yml +++ b/data/rules/hashicorp.yml @@ -5,16 +5,20 @@ rules: pattern: | (?x) (?i: hashicorp | vault | token | key | secret ) - ["':=\ ]{0,5} + (?:.|[\n\r]){0,32}? 
\b - (s\.[A-Za-z0-9_-]{24,128}) - (?: [^A-Za-z0-9_-] | $ ) + ( + s\.[A-Za-z0-9_-]{24,128} + ) + \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.0 examples: - 'VAULT_CLIENT_TOKEN="s.Z4bTMtngfLeQ18AqVoBBkUAOD1"' - - '`-vaultToken s.CAESIP2jTxc9S3K7Z6CtcFWQv7-044m_oSsxkingfisher.0H3nF89l3GiYKHGh3cy5sQmlIZVNyTWJNcDRsYWJpQjlhYjVlb2cQh6PL8wEYAg"`' + - 'vaultToken="s.CAESIP2jTxc9S3K7Z6CtcFWQv7-044m_oS.0H3nF89l3GiYKHGh3cy5sQmlIZVNyTWJNcDRsYWJpQjlhYjVlb2cQh6PL8wEYAg"`' references: - https://developer.hashicorp.com/vault/docs/concepts/tokens @@ -26,9 +30,10 @@ rules: (?x) (?i: hashicorp | vault | token | key | secret ) ["':=\ ]{0,5} - \b (b\.[A-Za-z0-9_-]{24,500}) (?: [^A-Za-z0-9_-] | $ ) + pattern_requirements: + min_digits: 2 examples: - 'VAULT_CLIENT_TOKEN="b.Z4bTMtngfLeQ18AqVoBBkUAOD1"' @@ -44,9 +49,10 @@ rules: (?x) (?i: hashicorp | vault | token | key | secret ) ["':=\ ]{0,5} - \b (r\.[A-Za-z0-9_-]{24,500}) (?: [^A-Za-z0-9_-] | $ ) + pattern_requirements: + min_digits: 2 examples: - 'VAULT_CLIENT_TOKEN="r.Z4bTMtngfLeQ18AqVoBBkUAOD1"' @@ -62,9 +68,10 @@ rules: pattern: | (?x) - \b (hvs\.[A-Za-z0-9]{24,130}) (?: [^A-Za-z0-9_-] | $ ) + pattern_requirements: + min_digits: 2 examples: - "apikey: hvs.JGbZZaCkOSgsZ56uhGlTK2zyC1j2mwhy0VLp4" @@ -79,9 +86,10 @@ rules: pattern: | (?x) - \b (hvb\.[A-Za-z0-9_-]{24,500}) (?: [^A-Za-z0-9_-] | $ ) + pattern_requirements: + min_digits: 2 examples: - "apikey: hvb.JGbZZaCkOSgsZ56uhGlTK2zyC1j2mwhy0VLp4" @@ -98,9 +106,10 @@ rules: pattern: | (?x) - \b (hvr\.[A-Za-z0-9]{24,130}) (?: [^A-Za-z0-9_-] | $ ) + pattern_requirements: + min_digits: 2 examples: - "apikey: hvr.JGbZZaCkOSgsZ56uhGlTK2zyC1j2mwhy0VLp4" @@ -119,9 +128,10 @@ rules: (?i: unseal ) \b .{1,10} - \b ([a-zA-Z0-9+/]{44}) (?: [^a-zA-Z0-9+/] | $ ) + pattern_requirements: + min_digits: 2 examples: - "Unseal Key 2: 0tZn+7QQCxphpHwTm7/dC3LpP5JGIbYl3PK8Sy81R+P2" diff --git a/data/rules/heroku.yml b/data/rules/heroku.yml index b06d58e..96e94e4 100644 --- 
a/data/rules/heroku.yml +++ b/data/rules/heroku.yml @@ -5,13 +5,15 @@ rules: (?xi) \b heroku - (?:.|[\n\r]){0,32}? + (?:.|[\n\r]){0,32}? \b ( [0-9a-f]{8}-[0-9a-f]{4}- [0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.0 confidence: medium examples: @@ -35,10 +37,11 @@ rules: id: kingfisher.heroku.2 pattern: | (?xi) - \b ( HRKU-[A-Z0-9_]{60} ) + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 4.0 validation: diff --git a/data/rules/hubspot.yml b/data/rules/hubspot.yml index 2840203..5cb5372 100644 --- a/data/rules/hubspot.yml +++ b/data/rules/hubspot.yml @@ -8,6 +8,8 @@ rules: pat-[a-z0-9]{2,3}-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.0 validation: diff --git a/data/rules/huggingface.yml b/data/rules/huggingface.yml index d103438..c70b993 100644 --- a/data/rules/huggingface.yml +++ b/data/rules/huggingface.yml @@ -3,7 +3,6 @@ rules: id: kingfisher.huggingface.1 pattern: | (?xi) - \b (?: ( (?:api_org|hf)_ @@ -11,6 +10,8 @@ rules: ) ) \b + pattern_requirements: + min_digits: 2 references: - https://huggingface.co/docs/hub/security-tokens min_entropy: 3.3 diff --git a/data/rules/ibm.yml b/data/rules/ibm.yml index 55a33be..a2e7ec6 100644 --- a/data/rules/ibm.yml +++ b/data/rules/ibm.yml @@ -3,15 +3,16 @@ rules: id: kingfisher.ibm.1 pattern: | (?xi) - \b (?:ibm(?:cloud)?|bx) (?:.|[\n\r]){0,32}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,32}? 
- \b ( [0-9A-Z_-]{42,44} ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium diff --git a/data/rules/infracost.yml b/data/rules/infracost.yml index 598b6c3..42dc8d6 100644 --- a/data/rules/infracost.yml +++ b/data/rules/infracost.yml @@ -8,6 +8,8 @@ rules: ico-[a-z0-9]{32} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/intercom.yml b/data/rules/intercom.yml index e521759..d6f159e 100644 --- a/data/rules/intercom.yml +++ b/data/rules/intercom.yml @@ -7,9 +7,12 @@ rules: (?:.|[\n\r]){0,16}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,16}? + \b ( [0-9A-Z+/]{59}= ) + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium diff --git a/data/rules/ionic.yml b/data/rules/ionic.yml index 1130a38..b37b83b 100644 --- a/data/rules/ionic.yml +++ b/data/rules/ionic.yml @@ -7,8 +7,10 @@ rules: ( ion_ [a-z0-9]{42} - ) - \b + ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/ipstack.yml b/data/rules/ipstack.yml index a70e43e..1dbb576 100644 --- a/data/rules/ipstack.yml +++ b/data/rules/ipstack.yml @@ -13,6 +13,8 @@ rules: (?:[0-9a-f]{16}){2} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.0 confidence: medium examples: diff --git a/data/rules/jenkins.yml b/data/rules/jenkins.yml index f8fbb77..36a1d49 100644 --- a/data/rules/jenkins.yml +++ b/data/rules/jenkins.yml @@ -2,7 +2,8 @@ rules: - name: Jenkins Token or Crumb id: kingfisher.jenkins.1 pattern: '(?i)jenkins.{0,10}(?:crumb)?.{0,10}\b([0-9a-f]{32,36})\b' - categories: [api, fuzzy, secret] + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: @@ -17,8 +18,6 @@ rules: export JENKINS=jenkins-cicd.apps.sno.openshiftlabs.net - | sh "curl -X POST 'http://jenkins.lsfusion.luxsoft.by/job/${Paths.updateParentVersionsJob}/build' --user ${USERPASS} -H 'Jenkins-Crumb:440561953171ba44ace9740562d172bb'" - 
negative_examples: - - '1. ~~Does not play well with [Build Token Root Plugin](https://wiki.jenkins-ci.org/display/JENKINS/Build+Token+Root+Plugin) URL formats.~~ (added with [this commit](https://github.com/morficus/Parameterized-Remote-Trigger-Plugin/commit/f687dbe75d1c4f39f7e14b68220890384d7c5674) )' references: - https://www.jenkins.io/blog/2018/07/02/new-api-token-system/ - https://www.jenkins.io/doc/book/security/csrf-protection/ \ No newline at end of file diff --git a/data/rules/jina.yml b/data/rules/jina.yml index efe5b10..807f5a7 100644 --- a/data/rules/jina.yml +++ b/data/rules/jina.yml @@ -4,8 +4,12 @@ rules: pattern: | (?x) \b - (jina_[a-zA-Z0-9]{60}) + ( + jina_[a-zA-Z0-9]{60} + ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.0 examples: diff --git a/data/rules/jira.yml b/data/rules/jira.yml index a555d85..2be3a53 100644 --- a/data/rules/jira.yml +++ b/data/rules/jira.yml @@ -2,11 +2,14 @@ rules: - name: Jira Domain id: kingfisher.jira.1 pattern: | - (?xi) + (?xi) + \b ( [a-z][a-z0-9-]{5,24}\.atlassian\.net ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 visible: false confidence: medium @@ -26,8 +29,12 @@ rules: \b ( [a-z0-9-]{24} - ) - \b + ) + \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/jwt.yml b/data/rules/jwt.yml index b237b4c..b39d991 100644 --- a/data/rules/jwt.yml +++ b/data/rules/jwt.yml @@ -3,7 +3,6 @@ rules: id: kingfisher.jwt.1 pattern: | (?x) - \b ( (?:ey|ewogIC)[A-Za-z0-9_-]{12,} (?# header ) \. @@ -11,7 +10,9 @@ rules: \. 
[A-Za-z0-9_-]{12,} (?# signature ) ) - (?:[^A-Z0-9_-]|$) (?# this instead of a \b anchor because that doesn't play nicely with `-` ) + (?:[^A-Z0-9_-]|$) + pattern_requirements: + min_digits: 4 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/kagi.yml b/data/rules/kagi.yml index 17732af..b1d311c 100644 --- a/data/rules/kagi.yml +++ b/data/rules/kagi.yml @@ -3,12 +3,12 @@ rules: id: kingfisher.kagi.1 pattern: | (?x)(?s) - \b (?: kagi | KAGI ) .{0,100} - \b ( [a-zA-Z0-9_-]{11}\.[a-zA-Z0-9_-]{43} ) (?: $ | [^a-zA-Z0-9_-] ) + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.0 references: diff --git a/data/rules/kickbox.yml b/data/rules/kickbox.yml index 35a4e38..897d25d 100644 --- a/data/rules/kickbox.yml +++ b/data/rules/kickbox.yml @@ -13,6 +13,8 @@ rules: [A-Z0-9_]+[A-Z0-9]{64} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/langchain.yml b/data/rules/langchain.yml index cd853e2..6f4da05 100644 --- a/data/rules/langchain.yml +++ b/data/rules/langchain.yml @@ -8,6 +8,8 @@ rules: lsv2_(?:pt)_[0-9a-f]{32}_[0-9a-f]{10} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 4.0 examples: - "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" @@ -34,6 +36,8 @@ rules: lsv2_sk_[0-9a-f]{32}_[0-9a-f]{10} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 4.0 examples: - "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" diff --git a/data/rules/launchdarkly.yml b/data/rules/launchdarkly.yml index f6d15d5..26a27d3 100644 --- a/data/rules/launchdarkly.yml +++ b/data/rules/launchdarkly.yml @@ -9,6 +9,8 @@ rules: ( [a-z0-9_\-=]{40} ) + pattern_requirements: + min_digits: 2 min_entropy: 3.2 confidence: medium examples: diff --git a/data/rules/line.yml b/data/rules/line.yml index c3b43dd..fc6ec4e 100644 --- a/data/rules/line.yml +++ b/data/rules/line.yml @@ -12,6 +12,8 @@ rules: ( (?:[0-9A-Z+/]{57}){3}=? 
) + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/linear.yml b/data/rules/linear.yml index fdaa30d..3b73105 100644 --- a/data/rules/linear.yml +++ b/data/rules/linear.yml @@ -7,8 +7,10 @@ rules: ( lin_api_ (?:[0-9A-Z]{8}){5} - ) - \b + ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/linkedin.yml b/data/rules/linkedin.yml index 7663c28..cd293b7 100644 --- a/data/rules/linkedin.yml +++ b/data/rules/linkedin.yml @@ -10,6 +10,8 @@ rules: (?: id | identifier | key ) .{0,2} \s{0,20} .{0,2} \s{0,20} .{0,2} \b ([a-z0-9]{12,14}) \b + pattern_requirements: + min_digits: 2 references: - https://docs.microsoft.com/en-us/linkedin/shared/api-guide/best-practices/secure-applications min_entropy: 2.5 @@ -44,6 +46,8 @@ rules: (?: key | oauth | sec | secret )? .{0,2} \s{0,20} .{0,2} \s{0,20} .{0,2} \b ([a-z0-9]{16}) \b + pattern_requirements: + min_digits: 2 references: - https://docs.microsoft.com/en-us/linkedin/shared/api-guide/best-practices/secure-applications min_entropy: 3.3 diff --git a/data/rules/lob.yml b/data/rules/lob.yml index 152c445..deb8800 100644 --- a/data/rules/lob.yml +++ b/data/rules/lob.yml @@ -10,6 +10,8 @@ rules: (?:live|test)_[a-f0-9]{35} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: @@ -42,6 +44,8 @@ rules: (?:test|live)_pub_[a-f0-9]{31} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.0 confidence: medium examples: diff --git a/data/rules/mailchimp.yml b/data/rules/mailchimp.yml index ffa5d3a..c3f6962 100644 --- a/data/rules/mailchimp.yml +++ b/data/rules/mailchimp.yml @@ -13,6 +13,9 @@ rules: (?:[0-9a-f]{8}){4} -us\d{1,2} ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/mailgun.yml b/data/rules/mailgun.yml index 06a02c2..5181f3b 100644 --- a/data/rules/mailgun.yml +++ b/data/rules/mailgun.yml @@ -11,7 
+11,11 @@ rules: \b ( (?:[0-9A-Z-]{24}){3} - ) + ) + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.5 confidence: medium examples: @@ -41,6 +45,8 @@ rules: key-(?:[0-9a-f]{8}){4} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/mandrill.yml b/data/rules/mandrill.yml index 4abd63d..857fbeb 100644 --- a/data/rules/mandrill.yml +++ b/data/rules/mandrill.yml @@ -11,7 +11,11 @@ rules: \b ( (?:[0-9A-Z_-]{11}){2} - ) + ) + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/mapbox.yml b/data/rules/mapbox.yml index d0253d6..ecbfa29 100644 --- a/data/rules/mapbox.yml +++ b/data/rules/mapbox.yml @@ -2,6 +2,8 @@ rules: - name: Mapbox Public Access Token id: kingfisher.mapbox.1 pattern: '(?i)(?s)mapbox.{0,30}(pk\.[a-z0-9\-+/=]{32,128}\.[a-z0-9\-+/=]{20,30})(?:[^a-z0-9\-+/=]|$)' + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: @@ -27,6 +29,8 @@ rules: - name: Mapbox Secret Access Token id: kingfisher.mapbox.2 pattern: '(?i)(?s)mapbox.{0,30}(sk\.[a-z0-9\-+/=]{32,128}\.[a-z0-9\-+/=]{20,30})(?:[^a-z0-9\-+/=]|$)' + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: @@ -51,6 +55,8 @@ rules: - name: Mapbox Temporary Access Token id: kingfisher.mapbox.3 pattern: '(?i)(?s)mapbox.{0,30}(tk\.[a-z0-9\-+/=]{32,128}\.[a-z0-9\-+/=]{20,30})(?:[^a-z0-9\-+/=]|$)' + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/mattermost.yml b/data/rules/mattermost.yml index 564adb6..ef69c78 100644 --- a/data/rules/mattermost.yml +++ b/data/rules/mattermost.yml @@ -34,6 +34,8 @@ rules: [A-Z0-9]{26} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 4.0 examples: diff --git a/data/rules/maxmind.yml b/data/rules/maxmind.yml index c0839b8..9952118 100644 --- 
a/data/rules/maxmind.yml +++ b/data/rules/maxmind.yml @@ -8,6 +8,8 @@ rules: [a-z0-9]{6}_[a-z0-9]{29}_mmk ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.8 confidence: medium examples: @@ -42,11 +44,9 @@ rules: (?:.|[\n\r]){0,10}? (?:id|number) (?:.|[\n\r]){0,16}? - \b ( \d{4,8} ) - \b min_entropy: 2.0 confidence: medium visible: false diff --git a/data/rules/messagebird.yml b/data/rules/messagebird.yml index 8ae1a58..4b89531 100644 --- a/data/rules/messagebird.yml +++ b/data/rules/messagebird.yml @@ -6,10 +6,13 @@ rules: \b message[_-]?bird (?:.|[\n\r]){0,32}? + \b ( [a-z0-9]{25} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.4 confidence: medium examples: diff --git a/data/rules/microsoft_teams.yml b/data/rules/microsoft_teams.yml index 37e4030..1f7ac49 100644 --- a/data/rules/microsoft_teams.yml +++ b/data/rules/microsoft_teams.yml @@ -26,6 +26,8 @@ rules: [0-9a-f]{4}- [0-9a-f]{12} ) + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/microsoftteamswebhook.yml b/data/rules/microsoftteamswebhook.yml index 4da1c1d..952f1b9 100644 --- a/data/rules/microsoftteamswebhook.yml +++ b/data/rules/microsoftteamswebhook.yml @@ -3,17 +3,23 @@ rules: id: kingfisher.microsoftteamswebhook.1 pattern: | (?xi) - https://[A-Z0-9]+\.webhook\.office\.com/webhookb2 - / - [A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12} - @ - [A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12} - / - IncomingWebhook - / - [A-Z0-9]{32} - / - [A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12} + \b + ( + https://[A-Z0-9]+\.webhook\.office\.com/webhookb2 + / + [A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12} + @ + [A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12} + / + IncomingWebhook + / + [A-Z0-9]{32} + / + [A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12} + ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git 
a/data/rules/mistral.yml b/data/rules/mistral.yml index 07b1af8..3e451da 100644 --- a/data/rules/mistral.yml +++ b/data/rules/mistral.yml @@ -13,6 +13,8 @@ rules: [A-Z0-9]{32} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.0 confidence: medium examples: diff --git a/data/rules/mongodb.yml b/data/rules/mongodb.yml index dc02e94..28b4031 100644 --- a/data/rules/mongodb.yml +++ b/data/rules/mongodb.yml @@ -11,6 +11,7 @@ rules: .{0,1000}? (?:private|priv|secret|auth|pass|key) (?:.|[\n\r]){0,32}? + \b ( [a-fA-F0-9]{8} - @@ -21,7 +22,12 @@ rules: [a-fA-F0-9]{4} - [a-fA-F0-9]{12} - ) + ) + \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.7 examples: - ATLAS_PRIVATE_KEY=4b18315e-6b7d-4337-b449-5d38f5a189ec @@ -90,7 +96,6 @@ rules: ( mdb_sa_sk_[0-9A-Z_-]{6}[0-9A-Z]{34} ) - \b min_entropy: 3.5 examples: - mdb_sa_sk_BdIX_jLzut2WTgglKzKvSgWMDDj5hEoTqdwOyLOL diff --git a/data/rules/nasa.yml b/data/rules/nasa.yml index efbcc42..1509b2e 100644 --- a/data/rules/nasa.yml +++ b/data/rules/nasa.yml @@ -8,10 +8,13 @@ rules: (?:.|[\n\r]){0,32}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,32}? + \b ( [A-Z0-9]{40} ) \b + pattern_requirements: + min_digits: 2 examples: - | get('https://api.nasa.gov/planetary/earth/imagery?api_key=fWfSMcDzyHfMuH8BW6jiIUBYaj0hKRyKBRTBqgEQ') diff --git a/data/rules/netlify.yml b/data/rules/netlify.yml index 0a29d25..7fae1eb 100644 --- a/data/rules/netlify.yml +++ b/data/rules/netlify.yml @@ -8,8 +8,12 @@ rules: (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,32}? 
\b - ([a-f0-9]{60,64}) + ( + [a-f0-9]{60,64} + ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 examples: - netlify_token=3cdfad7b885a6daceff3fb820389115750b373763fb30b10ca0382648b55872d @@ -41,6 +45,8 @@ rules: [A-Z0-9_-]{43,45} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/newrelic.yml b/data/rules/newrelic.yml index d9a582c..c208aa0 100644 --- a/data/rules/newrelic.yml +++ b/data/rules/newrelic.yml @@ -14,6 +14,9 @@ rules: - [A-Z0-9_.]{42} ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/ngrok.yml b/data/rules/ngrok.yml index 56f1b62..951cfd5 100644 --- a/data/rules/ngrok.yml +++ b/data/rules/ngrok.yml @@ -7,7 +7,6 @@ rules: (?:.|[\\n\r]){0,32}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,32}? - \b (?:[a-z0-9]{25,30}_\d[a-z0-9]{20}|(?:cr_|ak_)[a-z0-9]{25,30}) \b min_entropy: 4 diff --git a/data/rules/notion.yml b/data/rules/notion.yml index 642128e..2682c8c 100644 --- a/data/rules/notion.yml +++ b/data/rules/notion.yml @@ -10,6 +10,8 @@ rules: secret_[A-Z0-9]{43} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 4.0 confidence: medium examples: @@ -38,11 +40,9 @@ rules: (?xi) notion (?:.|[\\n\r]){0,32}? - \b ( ntn_[A-Z0-9]{40,55} ) - \b min_entropy: 4.0 confidence: medium references: @@ -74,11 +74,9 @@ rules: (?xi) notion (?:.|[\\n\r]){0,32}? - \b ( nrt_[A-Z0-9_]{40,55} ) - \b min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/npm.yml b/data/rules/npm.yml index 5d2e8c8..e275e14 100644 --- a/data/rules/npm.yml +++ b/data/rules/npm.yml @@ -8,6 +8,8 @@ rules: npm_[A-Z0-9]{36} ) \b + pattern_requirements: + min_digits: 2 references: - https://docs.npmjs.com/about-access-tokens - https://github.com/github/roadmap/issues/557 @@ -35,7 +37,6 @@ rules: id: kingfisher.npm.2 pattern: | (?xi) - \b (?:_authToken|NPM_TOKEN) (?:.|[\n\r]){0,16}? 
( diff --git a/data/rules/nuget.yml b/data/rules/nuget.yml index 663a415..3ba909e 100644 --- a/data/rules/nuget.yml +++ b/data/rules/nuget.yml @@ -8,6 +8,8 @@ rules: oy2[a-z0-9]{43} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: @@ -40,11 +42,9 @@ rules: (?:.|[\n\r]){0,32}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,32}? - \b ( [a-z0-9]{46} ) - \b min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/nvidia.yml b/data/rules/nvidia.yml index 1dc7b31..3143a80 100644 --- a/data/rules/nvidia.yml +++ b/data/rules/nvidia.yml @@ -8,6 +8,8 @@ rules: nvapi-[A-Z0-9_-]{60,70} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.5 examples: diff --git a/data/rules/nytimes.yml b/data/rules/nytimes.yml index 3c0d08a..391dd3f 100644 --- a/data/rules/nytimes.yml +++ b/data/rules/nytimes.yml @@ -10,6 +10,8 @@ rules: [a-z0-9_\-=]{32} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.2 confidence: medium examples: diff --git a/data/rules/okta.yml b/data/rules/okta.yml index 65490b4..16511f4 100644 --- a/data/rules/okta.yml +++ b/data/rules/okta.yml @@ -10,6 +10,9 @@ rules: ( 00[a-z0-9_-]{39}[a-z0-9_] ) + \b + pattern_requirements: + min_digits: 4 min_entropy: 3.3 examples: - okta_api_token=00hqNORUpnTcdFWA5WEM4YwOkw6RXeFw21lFDRKmY1 @@ -44,7 +47,6 @@ rules: id: kingfisher.okta.2 pattern: | (?xi) - \b ( [a-z0-9-]{1,40}\.okta(?:preview|-emea)?\.com ) diff --git a/data/rules/ollama.yml b/data/rules/ollama.yml index a2df4ab..ba686c6 100644 --- a/data/rules/ollama.yml +++ b/data/rules/ollama.yml @@ -10,6 +10,8 @@ rules: ( [a-f0-9]{32}\.[a-zA-Z0-9_-]{24} ) + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.5 validation: diff --git a/data/rules/onepassword.yml b/data/rules/onepassword.yml index b7cc0bc..e7f6183 100644 --- a/data/rules/onepassword.yml +++ b/data/rules/onepassword.yml @@ -6,7 +6,10 @@ rules: \b ( ops_eyj[A-Za-z0-9_-]{80,500} - )\b + ) + \b + 
pattern_requirements: + min_digits: 2 min_entropy: 4.0 confidence: medium examples: @@ -37,11 +40,11 @@ rules: id: kingfisher.1password.2 pattern: | (?xi) - \b ( A[0-9]-[A-Z0-9]{6}-[A-Z0-9]{6}-[A-Z0-9]{5}(?:-[A-Z0-9]{5}){3} ) - \b + pattern_requirements: + min_digits: 2 min_entropy: 3.8 confidence: medium examples: diff --git a/data/rules/openai.yml b/data/rules/openai.yml index d0b01cb..b6f90e0 100644 --- a/data/rules/openai.yml +++ b/data/rules/openai.yml @@ -8,6 +8,8 @@ rules: sk-[A-Z0-9]{48} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: @@ -38,6 +40,8 @@ rules: (sk-(?:proj|svcacct|None)-[A-Z0-9_-]{100,}) ) \b + pattern_requirements: + min_digits: 2 min_entropy: 4.0 confidence: medium examples: diff --git a/data/rules/openweathermap.yml b/data/rules/openweathermap.yml index c6b0251..b3fa549 100644 --- a/data/rules/openweathermap.yml +++ b/data/rules/openweathermap.yml @@ -6,7 +6,6 @@ rules: id: kingfisher.openweather.1 pattern: | (?xi) - \b (?:pyowm|openweather|owm\b) (?:.|[\n\r]){0,64}? \b @@ -17,6 +16,8 @@ rules: [a-z0-9]{32} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/opsgenie.yml b/data/rules/opsgenie.yml index 012f296..0e1c620 100644 --- a/data/rules/opsgenie.yml +++ b/data/rules/opsgenie.yml @@ -12,6 +12,9 @@ rules: ( [0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12} ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 examples: - opsgenie_api_key = '12345678-9abc-def0-1234-56789abcdef0' diff --git a/data/rules/pagerdutyapikey.yml b/data/rules/pagerdutyapikey.yml index b0a3c9b..7bdf1f0 100644 --- a/data/rules/pagerdutyapikey.yml +++ b/data/rules/pagerdutyapikey.yml @@ -3,7 +3,6 @@ rules: id: kingfisher.pagerduty.1 pattern: | (?xi) - \b (?: pd[_-]? | pagerduty[_-]? 
| @@ -19,6 +18,8 @@ rules: [a-f0-9]{32} # integration / routing key (32 hex, lower case) ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/particle.io.yml b/data/rules/particle.io.yml index c4fdfaf..eb8dd6b 100644 --- a/data/rules/particle.io.yml +++ b/data/rules/particle.io.yml @@ -5,9 +5,12 @@ rules: (?xi) https://api\.particle\.io/v1/[A-Z0-9_\-\s/"\\?]* (?:access_token=|Authorization:\s*Bearer\s*) + ( + [A-Z0-9]{40} + ) \b - ([A-Z0-9]{40}) - \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: @@ -42,7 +45,6 @@ rules: pattern: | (?xi) (?:access_token=|Authorization:\s*Bearer\s*) - \b ([A-Z0-9]{40}) \b [\s"\\]*https://api\.particle\.io/v1 diff --git a/data/rules/pastebin.yml b/data/rules/pastebin.yml index 1ae302e..f19536c 100644 --- a/data/rules/pastebin.yml +++ b/data/rules/pastebin.yml @@ -13,6 +13,8 @@ rules: [A-Z0-9_]{32} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/paypal.yml b/data/rules/paypal.yml index ef65d2d..47fddb1 100644 --- a/data/rules/paypal.yml +++ b/data/rules/paypal.yml @@ -1,56 +1,59 @@ rules: -- name: PayPal OAuth Client ID - id: kingfisher.paypal.1 - pattern: | - (?xi) - paypal - (?:.|[\n\r]){0,8}? - (?:CLIENT|ID|USER) - (?:.|[\n\r]){0,16}? - \b - ( - A[A-Z0-9_-]{78,99} - ) - \b - min_entropy: 3.5 - visible: false - examples: - - paypal_client_id=AZJ6y8Dpr1TYbqAIdhkPzyhjXoY6mIdhkPzyhjXoY6m8GplL7C3zZ3lPrkTIdhkPzyhjXo_Dx3IdhkPzyhjXoY6m + - name: PayPal OAuth Client ID + id: kingfisher.paypal.1 + pattern: | + (?xi) + paypal + (?:.|[\n\r]){0,8}? + (?:CLIENT|ID|USER) + (?:.|[\n\r]){0,16}? 
+ \b + ( + A[A-Z0-9_-]{78,99} + ) + pattern_requirements: + min_digits: 2 + min_entropy: 3.5 + visible: false + examples: + - paypal_client_id=AZJ6y8Dpr1TYbqAIdhkPzyhjXoY6mIdhkPzyhjXoY6m8GplL7C3zZ3lPrkTIdhkPzyhjXo_Dx3IdhkPzyhjXoY6m -- name: PayPal OAuth Secret - id: kingfisher.paypal.2 - pattern: | - (?xi) - paypal - (?:.|[\n\r]){0,16}? - (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) - (?:.|[\n\r]){0,32}? - \b - ( - [A-Z0-9_.-]{78,120} - ) - \b - min_entropy: 3.5 - examples: - - paypal_secret=EP0uwUsACKVPcbDRaXFYerX2ij6nbsha71cSdynuQWoSt1pIy4qtIs7gJQRmHwKXu5Icv3g1YQZzAywf + - name: PayPal OAuth Secret + id: kingfisher.paypal.2 + pattern: | + (?xi) + paypal + (?:.|[\n\r]){0,16}? + (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) + (?:.|[\n\r]){0,32}? + \b + ( + [A-Z0-9_.-]{78,120} + ) + \b + pattern_requirements: + min_digits: 2 + min_entropy: 3.5 + examples: + - paypal_secret=EP0uwUsACKVPcbDRaXFYerX2ij6nbsha71cSdynuQWoSt1pIy4qtIs7gJQRmHwKXu5Icv3g1YQZzAywf - validation: - type: Http - content: - request: - method: POST - url: https://api-m.paypal.com/v1/oauth2/token - headers: - Accept: application/json - Accept-Language: en_US - Content-Type: application/x-www-form-urlencoded - Authorization: | - Basic {{ CLIENTID | append: ':' | append: TOKEN | b64enc }} - body: grant_type=client_credentials - response_matcher: - - report_response: true - - type: StatusMatch - status: [200] - depends_on_rule: - - rule_id: kingfisher.paypal.1 - variable: CLIENTID + validation: + type: Http + content: + request: + method: POST + url: https://api-m.paypal.com/v1/oauth2/token + headers: + Accept: application/json + Accept-Language: en_US + Content-Type: application/x-www-form-urlencoded + Authorization: | + Basic {{ CLIENTID | append: ':' | append: TOKEN | b64enc }} + body: grant_type=client_credentials + response_matcher: + - report_response: true + - type: StatusMatch + status: [200] + depends_on_rule: + - rule_id: kingfisher.paypal.1 + variable: CLIENTID diff --git a/data/rules/pem.yml 
b/data/rules/pem.yml index 00d93c6..0c0d921 100644 --- a/data/rules/pem.yml +++ b/data/rules/pem.yml @@ -8,6 +8,8 @@ rules: ( (?: [a-zA-Z0-9+/=\s"',] | \\r | \\n ) {50,} ) \s* -----END\ .{0,20}\ ?PRIVATE\ KEY\ ?.{0,20}----- + pattern_requirements: + min_digits: 4 min_entropy: 4.5 confidence: high examples: @@ -60,6 +62,8 @@ rules: [a-zA-Z0-9+/=]{50,} ) (?: [^a-zA-Z0-9+/=] | $ ) + pattern_requirements: + min_digits: 4 min_entropy: 4.5 confidence: high examples: diff --git a/data/rules/perplexity.yml b/data/rules/perplexity.yml index 6629372..c9ba476 100644 --- a/data/rules/perplexity.yml +++ b/data/rules/perplexity.yml @@ -8,6 +8,8 @@ rules: pplx-[A-Za-z0-9]{48} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.8 confidence: medium examples: diff --git a/data/rules/planetscale.yml b/data/rules/planetscale.yml index ecbee4f..859531b 100644 --- a/data/rules/planetscale.yml +++ b/data/rules/planetscale.yml @@ -8,6 +8,8 @@ rules: pscale_tkn_[a-z0-9-_]{43} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 4 examples: - pscale_tkn_abcdefghijklmnopqrstuvwxyZ1234567890_ABCDEF @@ -41,11 +43,11 @@ rules: (?:.|[\n\r]){0,16}? (?:USER|ID|NAME) (?:.|[\n\r]){0,16}? 
- \b ( [a-z0-9]{12} ) - \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 visible: false examples: diff --git a/data/rules/postman.yml b/data/rules/postman.yml index ae7ae7c..8b98b2e 100644 --- a/data/rules/postman.yml +++ b/data/rules/postman.yml @@ -8,6 +8,8 @@ rules: PMAK-[A-Z0-9]{24}-[A-Z0-9]{34} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/postmark.yml b/data/rules/postmark.yml index 6e31071..f48aa57 100644 --- a/data/rules/postmark.yml +++ b/data/rules/postmark.yml @@ -10,6 +10,8 @@ rules: [a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.0 examples: diff --git a/data/rules/prefect.yml b/data/rules/prefect.yml index c04c9cb..beba175 100644 --- a/data/rules/prefect.yml +++ b/data/rules/prefect.yml @@ -8,6 +8,8 @@ rules: pnu_[a-z0-9]{36} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.0 confidence: medium examples: diff --git a/data/rules/privkey.yml b/data/rules/privkey.yml index 7b5e551..a936514 100644 --- a/data/rules/privkey.yml +++ b/data/rules/privkey.yml @@ -22,6 +22,8 @@ rules: PRIVATE\sKEY (\sBLOCK)? ----- + pattern_requirements: + min_digits: 2 min_entropy: 4.5 confidence: high examples: @@ -73,7 +75,11 @@ rules: PRIVATE\sKEY (\sBLOCK)? 
----- - ) + ) + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 4.5 confidence: high examples: diff --git a/data/rules/psexec.yml b/data/rules/psexec.yml index 57cfd66..3cca3d3 100644 --- a/data/rules/psexec.yml +++ b/data/rules/psexec.yml @@ -8,7 +8,6 @@ rules: -p \s* (\S+) (?# password ) min_entropy: 3.3 confidence: medium - categories: [fuzzy, secret] examples: - 'cmd.exe /C PSEXEC \\10.0.94.120 -u Administrator -p dev_admin CMD /C ECHO' - 'PSEXEC.EXE \\LocalComputerIPAddress -u DOMAIN\my-user -p mypass CMD' diff --git a/data/rules/pubnub.yml b/data/rules/pubnub.yml index f29310a..f759ed7 100644 --- a/data/rules/pubnub.yml +++ b/data/rules/pubnub.yml @@ -8,6 +8,8 @@ rules: pub-c-[a-z0-9]{8}(?:-[a-z0-9]{4}){3}-[a-z0-9]{12} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 examples: - pub-c-12345678-1234-1234-1234-123456789012 @@ -34,6 +36,8 @@ rules: sub-c-[a-z0-9]{8}(?:-[a-z0-9]{4}){3}-[a-z0-9]{12} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/pulumi.yml b/data/rules/pulumi.yml index 2298d1d..81980a1 100644 --- a/data/rules/pulumi.yml +++ b/data/rules/pulumi.yml @@ -8,6 +8,8 @@ rules: pul-[a-f0-9]{40} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 examples: - pul-18e13e3eebebeb94eac318d421ca8ecc5ca78d5f diff --git a/data/rules/pypi.yml b/data/rules/pypi.yml index 85884ed..dd11dd4 100644 --- a/data/rules/pypi.yml +++ b/data/rules/pypi.yml @@ -8,6 +8,8 @@ rules: pypi-AgEIcHlwaS5vcmc[A-Z0-9_-]{50,} ) (?:[^A-Z0-9_-]|$) + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/rabbitmq.yml b/data/rules/rabbitmq.yml index 953f10e..607d5f6 100644 --- a/data/rules/rabbitmq.yml +++ b/data/rules/rabbitmq.yml @@ -3,7 +3,6 @@ rules: id: kingfisher.rabbitmq.1 pattern: | (?xi) - \b (?: amqps? 
) @@ -16,6 +15,8 @@ rules: @ [-.%\w\/:]+ \b + pattern_requirements: + min_special_chars: 1 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/react.yml b/data/rules/react.yml index c7d5a40..de22892 100644 --- a/data/rules/react.yml +++ b/data/rules/react.yml @@ -18,9 +18,6 @@ rules: - 'REACT_APP_AUTH_USERNAME=bowie' - ' REACT_APP_AUTH_USERNAME=bowie # some comment' - 'REACT_APP_MAILER_USERNAME=smtp_username # Enter your SMTP email username' - negative_examples: - - 'REACT_APP_FRONTEND_LOGIN_FORGOT_USERNAME=$REACT_APP_MATRIX_BASE_URL/classroom/#/forgot_username' - categories: [fuzzy, identifier] references: - https://create-react-app.dev/docs/adding-custom-environment-variables/ - https://stackoverflow.com/questions/48699820/how-do-i-hide-an-api-key-in-create-react-app @@ -44,10 +41,6 @@ rules: - 'REACT_APP_AUTH_PASSWORD=whiteduke' - ' REACT_APP_AUTH_PASSWORD=whiteduke # some comment' - 'REACT_APP_MAILER_PASSWORD=smtp_password # Enter your SMTP email password' - negative_examples: - - ' const password = process.env.REACT_APP_FIREBASE_DEV_PASSWORD || "not-set"' - - 'REACT_APP_FRONTEND_LOGIN_FORGOT_PASSWORD=$REACT_APP_MATRIX_BASE_URL/classroom/#/forgot_password' - categories: [fuzzy, secret] references: - https://create-react-app.dev/docs/adding-custom-environment-variables/ - https://stackoverflow.com/questions/48699820/how-do-i-hide-an-api-key-in-create-react-app \ No newline at end of file diff --git a/data/rules/readme.yml b/data/rules/readme.yml index 0e415ec..6a4b95d 100644 --- a/data/rules/readme.yml +++ b/data/rules/readme.yml @@ -2,11 +2,14 @@ rules: - name: ReadMe API Key id: kingfisher.readme.1 pattern: | - (?x)(?i) + (?xi) \b ( rdme_(?P[a-z0-9]{70}) ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/recaptcha.yml b/data/rules/recaptcha.yml index e8f62a6..f13db51 100644 --- a/data/rules/recaptcha.yml +++ b/data/rules/recaptcha.yml @@ -9,6 +9,9 @@ rules: ( 
6l[c-f][a-z0-9_-].{36} ) + \b + pattern_requirements: + min_digits: 3 min_entropy: 3 confidence: medium examples: diff --git a/data/rules/replicate.yml b/data/rules/replicate.yml index b5f1a47..6eb6ffd 100644 --- a/data/rules/replicate.yml +++ b/data/rules/replicate.yml @@ -2,13 +2,15 @@ rules: - name: Replicate API Token id: kingfisher.replicate.1 pattern: | - (?x) - \b + (?x) + \b ( r8_ [A-Za-z0-9]{37} ) \b + pattern_requirements: + min_digits: 3 min_entropy: 3.0 confidence: medium examples: diff --git a/data/rules/rubygems.yml b/data/rules/rubygems.yml index 93ace54..bb9487c 100644 --- a/data/rules/rubygems.yml +++ b/data/rules/rubygems.yml @@ -8,8 +8,10 @@ rules: ( rubygems_ [a-z0-9]{42,52} - ) - \b + ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium categories: [api, secret] diff --git a/data/rules/runway.yml b/data/rules/runway.yml index 9c816b2..8e899f7 100644 --- a/data/rules/runway.yml +++ b/data/rules/runway.yml @@ -2,13 +2,15 @@ rules: - name: Runway API Key id: kingfisher.runway.1 pattern: | - (?x) + (?x) \b ( key_ [A-Fa-f0-9]{128} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 4.0 confidence: medium examples: diff --git a/data/rules/salesforce.yml b/data/rules/salesforce.yml index d532346..be3b544 100644 --- a/data/rules/salesforce.yml +++ b/data/rules/salesforce.yml @@ -2,7 +2,7 @@ rules: - name: Salesforce Access / Refresh Token id: kingfisher.salesforce.1 pattern: | - (?xi) + (?xi) \b ( 00 @@ -10,6 +10,8 @@ rules: ! [A-Z0-9._-]{90,120} ) + pattern_requirements: + min_digits: 6 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/sauce.yml b/data/rules/sauce.yml index 6c8be24..009f33a 100644 --- a/data/rules/sauce.yml +++ b/data/rules/sauce.yml @@ -25,7 +25,6 @@ rules: id: kingfisher.saucelabs.2 pattern: | (?xi) - \b ( (?:api|ondemand)\.(?:us|eu)-(?:west|east|central)-[0-9]\.saucelabs\.com ) @@ -47,11 +46,12 @@ rules: (?:.|[\n\r]){0,32}? 
(?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,32}? - \b ( [a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12} ) \b + pattern_requirements: + min_digits: 4 confidence: medium min_entropy: 3.0 depends_on_rule: diff --git a/data/rules/scale.yml b/data/rules/scale.yml index b0d9b90..81148f2 100644 --- a/data/rules/scale.yml +++ b/data/rules/scale.yml @@ -4,11 +4,13 @@ rules: pattern: | (?x) \b - ( # capture => TOKEN - live_ # live-mode prefix per docs - [0-9a-f]{32} # 32 lowercase hex chars + ( + live_ + [0-9a-f]{32} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.1 confidence: medium examples: @@ -50,6 +52,8 @@ rules: [0-9a-f]{32} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.1 confidence: medium examples: diff --git a/data/rules/scalingo.yml b/data/rules/scalingo.yml index ea39b5a..15351c4 100644 --- a/data/rules/scalingo.yml +++ b/data/rules/scalingo.yml @@ -7,7 +7,8 @@ rules: ( tk-us-[\w-]{48} ) - \b + pattern_requirements: + min_digits: 2 min_entropy: 3.0 confidence: medium examples: diff --git a/data/rules/segment.yml b/data/rules/segment.yml index 1a2518f..234955d 100644 --- a/data/rules/segment.yml +++ b/data/rules/segment.yml @@ -8,6 +8,8 @@ rules: sgp_[A-Z0-9_-]{60,70} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: @@ -33,7 +35,6 @@ rules: id: kingfisher.segment.2 pattern: | (?xi) - \b (?:segment|sgmt) (?:.|[\n\r]){0,16}? 
(?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) @@ -42,7 +43,8 @@ rules: ( [A-Z0-9_-]{40,50}\.[A-Z0-9_-]{40,50} ) - \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/sendbird.yml b/data/rules/sendbird.yml index 9c36a46..d82ec94 100644 --- a/data/rules/sendbird.yml +++ b/data/rules/sendbird.yml @@ -12,6 +12,8 @@ rules: [0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12} ) \b + pattern_requirements: + min_digits: 2 confidence: medium visible: false min_entropy: 3.0 @@ -31,6 +33,8 @@ rules: [a-f0-9]{40} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 4.0 examples: diff --git a/data/rules/sendgrid.yml b/data/rules/sendgrid.yml index 89145fc..96c5bff 100644 --- a/data/rules/sendgrid.yml +++ b/data/rules/sendgrid.yml @@ -11,7 +11,8 @@ rules: \. [0-9A-Z_-]{39,47} ) - \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/sendinblue.yml b/data/rules/sendinblue.yml index 9b53d07..f57a613 100644 --- a/data/rules/sendinblue.yml +++ b/data/rules/sendinblue.yml @@ -8,6 +8,8 @@ rules: xkeysib-[a-f0-9]{64}-[a-z0-9]{16} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.2 confidence: medium examples: diff --git a/data/rules/sentry.yml b/data/rules/sentry.yml index 8eebda8..8d424ba 100644 --- a/data/rules/sentry.yml +++ b/data/rules/sentry.yml @@ -11,6 +11,8 @@ rules: [a-f0-9]{64} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: @@ -41,6 +43,9 @@ rules: ( sntrys_eyJpYXQiO[a-zA-Z0-9+/]{10,200}(?:LCJyZWdpb25fdXJs|InJlZ2lvbl91cmwi|cmVnaW9uX3VybCI6)[a-zA-Z0-9+/]{10,200}={0,2}_[a-zA-Z0-9+/]{43} ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 4.2 confidence: medium examples: @@ -72,6 +77,8 @@ rules: sntryu_[a-f0-9]{64} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/shippo.yml b/data/rules/shippo.yml index 
cd769e2..e1bae07 100644 --- a/data/rules/shippo.yml +++ b/data/rules/shippo.yml @@ -8,6 +8,8 @@ rules: shippo_(?:live|test)_[a-f0-9]{40} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/shodan.yml b/data/rules/shodan.yml index 99ab150..76f946c 100644 --- a/data/rules/shodan.yml +++ b/data/rules/shodan.yml @@ -13,6 +13,8 @@ rules: [A-Z0-9]{32} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 4.0 validation: diff --git a/data/rules/shopify.yml b/data/rules/shopify.yml index da340e5..bb92ea4 100644 --- a/data/rules/shopify.yml +++ b/data/rules/shopify.yml @@ -8,6 +8,8 @@ rules: (?:shpat|shpca|shppa|shpss)_[a-f0-9]{30,34} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 examples: - 'shopify_app_secret: "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"' diff --git a/data/rules/slack.yml b/data/rules/slack.yml index 9544d10..0e61776 100644 --- a/data/rules/slack.yml +++ b/data/rules/slack.yml @@ -15,6 +15,8 @@ rules: [0-9a-z]{10,66} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 examples: - xapp-1-A05V64V7F2B-5062360157732-9f01726eebe77df2c096a65e95acdd02107b2c1e92ca341cff27ca271b7251b4 @@ -55,7 +57,10 @@ rules: ( xoxe-\d- [A-Z0-9]{140,150} - )\b + ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 examples: - xoxb-853BAAEE-1B2eDb6A4c75-01bB6Da1CE3E98f6fED5AeC07Dc3E94C @@ -89,7 +94,8 @@ rules: B[a-z0-9_-]{8,12}/ [a-z0-9_-]{20,30} ) - \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 examples: - https://hooks.slack.com/services/TY40v9sZ9/BxIqhIXIi/NGUyXK6nK7HMAqd0ASzXluoV diff --git a/data/rules/snyk.yml b/data/rules/snyk.yml index e6237e3..3ba783f 100644 --- a/data/rules/snyk.yml +++ b/data/rules/snyk.yml @@ -8,10 +8,12 @@ rules: (?:.|[\n\r]){0,32}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,32}? 
- \b ( [A-Z0-9]{8}-(?:[A-Z0-9]{4}-){3}[A-Z0-9]{12} ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 examples: - snyk_token = 123e4567-e89b-12d3-a456-426614174000 diff --git a/data/rules/sonarcloud.yml b/data/rules/sonarcloud.yml index f0b874d..c3307e9 100644 --- a/data/rules/sonarcloud.yml +++ b/data/rules/sonarcloud.yml @@ -13,6 +13,8 @@ rules: [0-9a-z]{40} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 2.5 examples: - sonar_api_token=abcdef0123456789abcdef0123456789abcdef23 diff --git a/data/rules/sonarqube.yml b/data/rules/sonarqube.yml index ceddbee..63be8fe 100644 --- a/data/rules/sonarqube.yml +++ b/data/rules/sonarqube.yml @@ -8,6 +8,9 @@ rules: (?:sq[pua]) _[a-z0-9]{40} ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 examples: - sonar.login=sqp_4b78f8494075e310d62dfdcaeb14be2c78fca2fc @@ -35,11 +38,9 @@ rules: (?xi) sonar.{0,8}host (?:.|[\n\r]){0,64}? - \b ( https?://.*?:\d{2,6} ) - \b min_entropy: 3.5 visible: false examples: diff --git a/data/rules/sourcegraph.yml b/data/rules/sourcegraph.yml index 3aed539..cbda95d 100644 --- a/data/rules/sourcegraph.yml +++ b/data/rules/sourcegraph.yml @@ -4,8 +4,12 @@ rules: pattern: | (?xi) \b - sgp_(?:[a-f0-9]{16}_local_)?[a-f0-9]{40} + ( + sgp_(?:[a-f0-9]{16}_local_)?[a-f0-9]{40} + ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 examples: - sgp_210f1131b08e93adcfc3f05faa2d768ff883a61f @@ -41,6 +45,8 @@ rules: (?:sgp_(?:[a-f0-9]{16}_local_)?[a-f0-9]{40}|[a-f0-9]{40}) ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/square.yml b/data/rules/square.yml index d3fd851..c4aa62f 100644 --- a/data/rules/square.yml +++ b/data/rules/square.yml @@ -10,6 +10,9 @@ rules: ( EAAA[a-z0-9\-\+=]{60} ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: @@ -33,7 +36,15 @@ rules: - name: Square Access Token id: kingfisher.square.2 - pattern: '(?i)\b(sq0atp-[a-z0-9_-]{22})\b' + 
pattern: | + (?xi) + \b + ( + sq0atp-[a-z0-9_-]{22} + ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: @@ -57,7 +68,15 @@ rules: - name: Square OAuth Secret id: kingfisher.square.3 - pattern: '(?i)\b(sq0csp-[a-z0-9_-]{43})\b' + pattern: | + (?xi) + \b + ( + sq0csp-[a-z0-9_-]{43} + ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/stabilityai.yml b/data/rules/stabilityai.yml index 1ac8fd1..8848ce2 100644 --- a/data/rules/stabilityai.yml +++ b/data/rules/stabilityai.yml @@ -2,13 +2,15 @@ rules: - name: Stability AI API Key id: kingfisher.stabilityai.1 pattern: | - (?x) + (?x) \b ( sk- [A-Za-z0-9]{48} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 4.0 confidence: medium examples: diff --git a/data/rules/stackhawk.yml b/data/rules/stackhawk.yml index b8fec69..d001e40 100644 --- a/data/rules/stackhawk.yml +++ b/data/rules/stackhawk.yml @@ -8,6 +8,8 @@ rules: hawk\.[0-9A-Z_-]{20}\.[0-9A-Z_-]{20} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.0 examples: diff --git a/data/rules/stripe.yml b/data/rules/stripe.yml index faaa1ab..934dba7 100644 --- a/data/rules/stripe.yml +++ b/data/rules/stripe.yml @@ -8,10 +8,16 @@ rules: (?:.|[\n\r]){0,16}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,16}? + \b ( pk_live_ (?:[0-9A-Z]{6}){4,30} - ) + ) + \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.3 confidence: medium categories: [api, key] @@ -27,12 +33,16 @@ rules: (?:.|[\n\r]){0,16}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,16}? 
+ \b ( (?: sk|rk )_live_ (?:[0-9A-Z]{8}){3,25} ) + \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/supabase.yml b/data/rules/supabase.yml index d297514..b2d7c32 100644 --- a/data/rules/supabase.yml +++ b/data/rules/supabase.yml @@ -8,6 +8,8 @@ rules: sbp_[a-z0-9_-]{40} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.5 confidence: medium examples: @@ -37,8 +39,10 @@ rules: sb_secret_[a-z0-9_-]{31} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 4.0 - confidence: high + confidence: medium validation: type: Http content: @@ -67,9 +71,11 @@ rules: id: kingfisher.supabase.3 pattern: (?xi) + \b ( https:\/\/[a-z0-9]{16,32}\.supabase\.co ) + \b confidence: medium min_entropy: 3.0 visible: false diff --git a/data/rules/tailscale.yml b/data/rules/tailscale.yml index 3544b1f..08fa9eb 100644 --- a/data/rules/tailscale.yml +++ b/data/rules/tailscale.yml @@ -8,6 +8,8 @@ rules: tskey-[a-z]{3,10}-[A-Z0-9_-]{20,36} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.0 confidence: medium examples: diff --git a/data/rules/tavily.yml b/data/rules/tavily.yml index a908a21..1775eb4 100644 --- a/data/rules/tavily.yml +++ b/data/rules/tavily.yml @@ -9,6 +9,8 @@ rules: tvly-[a-zA-Z0-9]{32} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.0 references: diff --git a/data/rules/teamcity.yml b/data/rules/teamcity.yml index c3bf68b..b475c32 100644 --- a/data/rules/teamcity.yml +++ b/data/rules/teamcity.yml @@ -12,6 +12,8 @@ rules: \. 
[A-Za-z0-9_-]{48} ) + pattern_requirements: + min_digits: 2 examples: - '' references: diff --git a/data/rules/telegram.yml b/data/rules/telegram.yml index 9b6dc98..8786d64 100644 --- a/data/rules/telegram.yml +++ b/data/rules/telegram.yml @@ -10,6 +10,8 @@ rules: [A-Z0-9_-]{35} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.5 validation: diff --git a/data/rules/thingsboard.yml b/data/rules/thingsboard.yml index 562ee25..ce5ed78 100644 --- a/data/rules/thingsboard.yml +++ b/data/rules/thingsboard.yml @@ -3,8 +3,14 @@ rules: id: kingfisher.thingsboard.1 pattern: | (?x) + \b thingsboard\.cloud/api/v1/ - ([a-z0-9]{20}) + ( + [a-z0-9]{20} + ) + \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.0 examples: @@ -20,8 +26,10 @@ rules: pattern: | (?x) "provisionDeviceKey"\s*:\s*" - ([a-z0-9]{20}) - " + ( + [a-z0-9]{20} + ) + \b confidence: medium min_entropy: 3.0 examples: @@ -34,8 +42,10 @@ rules: pattern: | (?x) "provisionDeviceSecret"\s*:\s*" - ([a-z0-9]{20}) - " + ( + [a-z0-9]{20} + ) + \b confidence: medium min_entropy: 3.0 examples: diff --git a/data/rules/togetherai.yml b/data/rules/togetherai.yml index ee43097..e3d9b04 100644 --- a/data/rules/togetherai.yml +++ b/data/rules/togetherai.yml @@ -7,6 +7,9 @@ rules: ( tgp_v1_[A-Z0-9_-]{43} ) + \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.0 examples: diff --git a/data/rules/travisci.yml b/data/rules/travisci.yml index 73e75c4..1d4b188 100644 --- a/data/rules/travisci.yml +++ b/data/rules/travisci.yml @@ -13,6 +13,8 @@ rules: [A-Z-_0-9]{22} ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.0 confidence: medium examples: @@ -35,10 +37,8 @@ rules: id: kingfisher.travisci.2 pattern: | (?xis) - \b (?:language|env|deploy|script):[\r\n] (?:.|[\\n\r]){0,256}? 
- \b ( secure:\s*"?[A-Za-z0-9+/=\\]+"?\s* ) diff --git a/data/rules/truenas.yml b/data/rules/truenas.yml index a0a6d96..57ffcff 100644 --- a/data/rules/truenas.yml +++ b/data/rules/truenas.yml @@ -6,6 +6,8 @@ rules: "params"\s*:\s*\[\s*" (\d+-[a-zA-Z0-9]{64}) "\s*\] + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: @@ -36,6 +38,8 @@ rules: Bearer\s* (\d+-[a-zA-Z0-9]{64}) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/twilio.yml b/data/rules/twilio.yml index b3b2d32..14c5b0e 100644 --- a/data/rules/twilio.yml +++ b/data/rules/twilio.yml @@ -8,6 +8,9 @@ rules: (?:SK|AC)[A-F0-9]{32} ) \b + pattern_requirements: + min_digits: 2 + visible: false min_entropy: 3.5 examples: - | @@ -31,7 +34,10 @@ rules: ( [a-z0-9]{32} ) - \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.0 examples: - Twilio_key=Cd2Bd1dE1201aE2DFFEcfeBafCc3c31D diff --git a/data/rules/twitch.yml b/data/rules/twitch.yml index c17c225..1ab6102 100644 --- a/data/rules/twitch.yml +++ b/data/rules/twitch.yml @@ -6,10 +6,15 @@ rules: \b twitch (?:.|[\n\r]){0,32}? + \b ( [a-z0-9]{30} ) \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 3.5 confidence: medium examples: diff --git a/data/rules/twitter.yml b/data/rules/twitter.yml index e75b48e..941d48d 100644 --- a/data/rules/twitter.yml +++ b/data/rules/twitter.yml @@ -13,6 +13,10 @@ rules: A{10,}[A-Za-z0-9_\-]{40,200} ) \b + pattern_requirements: + min_digits: 2 + min_uppercase: 1 + min_lowercase: 1 min_entropy: 4.0 confidence: medium examples: @@ -45,11 +49,9 @@ rules: (?:.|[\n\r]){0,32}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,32}? - \b ( [A-Z0-9]{25} ) - \b min_entropy: 3.5 visible: false examples: @@ -63,11 +65,9 @@ rules: (?:.|[\n\r]){0,32}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,32}? 
- \b ( [A-Z0-9]{50} ) - \b min_entropy: 4.5 examples: - "TWITTER_SECRET=ZGwXeK2DNCqv49Z9ofwYdqlBgeoHDyh8uoAgHju6OeYC7wTQJq" diff --git a/data/rules/typeform.yml b/data/rules/typeform.yml index a91578d..cfbc3ee 100644 --- a/data/rules/typeform.yml +++ b/data/rules/typeform.yml @@ -9,6 +9,8 @@ rules: ( tfp_[a-z0-9_\-=\.]{59} ) + pattern_requirements: + min_digits: 2 min_entropy: 4.0 confidence: medium examples: diff --git a/data/rules/vercel.yml b/data/rules/vercel.yml index 121b5fe..3ace650 100644 --- a/data/rules/vercel.yml +++ b/data/rules/vercel.yml @@ -11,6 +11,10 @@ rules: [A-Z0-9]{24} ) \b + pattern_requirements: + min_digits: 6 + min_uppercase: 1 + min_lowercase: 1 confidence: medium min_entropy: 3.5 validation: diff --git a/data/rules/vmware.yml b/data/rules/vmware.yml index 9332a37..c623ebf 100644 --- a/data/rules/vmware.yml +++ b/data/rules/vmware.yml @@ -8,7 +8,6 @@ rules: -User \s+ (\S{3,30}) \s+ (?# username ) .{0,50} -Password \s+ (\S{3,30}) (?# password ) - examples: - 'Connect-VIServer -Server 192.168.1.51 -User administrator@vSphere.local -Password VMware1!' - | diff --git a/data/rules/weightsandbiases.yml b/data/rules/weightsandbiases.yml index bed9ca5..88bb37b 100644 --- a/data/rules/weightsandbiases.yml +++ b/data/rules/weightsandbiases.yml @@ -3,13 +3,15 @@ rules: id: kingfisher.wandb.1 pattern: | (?xi) - \b (?:wandb|weightsandbiases) (?:.|[\n\r]){0,16}? 
+ \b ( [a-f0-9]{40} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 3.5 examples: diff --git a/data/rules/wireguard.yml b/data/rules/wireguard.yml index 70aa1ae..47223c1 100644 --- a/data/rules/wireguard.yml +++ b/data/rules/wireguard.yml @@ -2,6 +2,8 @@ rules: - name: WireGuard Private Key id: kingfisher.wireguard.1 pattern: PrivateKey\s*=\s*([A-Za-z0-9+/]{43}=) + pattern_requirements: + min_digits: 2 min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/xAI.yml b/data/rules/xAI.yml index 398ac03..67a04ae 100644 --- a/data/rules/xAI.yml +++ b/data/rules/xAI.yml @@ -6,8 +6,10 @@ rules: \b ( xai-[A-Za-z0-9_-]{70,120} - ) + ) \b + pattern_requirements: + min_digits: 2 min_entropy: 3.8 confidence: medium examples: diff --git a/data/rules/zhipu.yml b/data/rules/zhipu.yml index bc1bffd..3d7ddab 100644 --- a/data/rules/zhipu.yml +++ b/data/rules/zhipu.yml @@ -10,6 +10,8 @@ rules: [A-Z0-9]{16} ) \b + pattern_requirements: + min_digits: 2 confidence: medium min_entropy: 4.0 examples: diff --git a/docs/RULES.md b/docs/RULES.md index 5a2fbfc..3b3e65a 100644 --- a/docs/RULES.md +++ b/docs/RULES.md @@ -38,6 +38,13 @@ rules: - rule_id: kingfisher.aws.id variable: AKID # referenced as {{ AKID }} + pattern_requirements: # (optional) character type requirements + min_digits: 1 # require at least 1 digit + min_uppercase: 1 # require at least 1 uppercase letter + min_lowercase: 1 # require at least 1 lowercase letter + min_special_chars: 1 # require at least 1 special character + special_chars: "!@#$%^&*()" # optional: custom special character set + validation: # (optional) live validation type: Http content: @@ -61,17 +68,18 @@ rules: - type: JsonValid ``` -| Field | What it does | -| ----------------- | -------------------------------------------------------------------- | -| name | Friendly name shown in reports | -| id | Unique text ID (namespace.v#) used internally | -| pattern | Regex used to spot secrets (free‑spacing & flags 
allowed) | -| min_entropy | Threshold to guard against low‑complexity false positives | -| confidence | Suggests severity: low → high | -| examples | Good matches; used for testing | -| visible | false to hide non‑secret captures (e.g. IDs) | -| depends_on_rule | Chain rules: use captures from one rule in another’s validation | -| validation | Configure HTTP, AWS, GCP, etc. checks to verify live validity | +| Field | What it does | +| ----------------------- | -------------------------------------------------------------------- | +| name | Friendly name shown in reports | +| id | Unique text ID (namespace.v#) used internally | +| pattern | Regex used to spot secrets (free‑spacing & flags allowed) | +| min_entropy | Threshold to guard against low‑complexity false positives | +| confidence | Suggests severity: low → high | +| examples | Good matches; used for testing | +| visible | false to hide non‑secret captures (e.g. IDs) | +| depends_on_rule | Chain rules: use captures from one rule in another's validation | +| pattern_requirements | Require specific character types (digits, uppercase, lowercase, special) | +| validation | Configure HTTP, AWS, GCP, etc. checks to verify live validity | *responser_matcher* variants. Multiple can be used @@ -237,6 +245,89 @@ For example, a rule might match a username, an email address, an AWS Access Key `visible: false` helps keep the scan output focused on actual secrets while still capturing important contextual data needed for comprehensive validation. +## Character Requirements + +The `pattern_requirements` field allows you to specify data type requirements for matched secrets. 
This is particularly useful when: + +- Your regex pattern must be permissive (due to Hyperscan limitations) +- You want to enforce password complexity requirements +- You need to filter out low-quality matches that lack certain character types + +Kingfisher's regex engine (Hyperscan) does not support lookahead assertions like `(?=.*\d)` to require specific character types. Instead, use the `pattern_requirements` field to filter matches post-detection. + +### Available Requirements + +```yaml +pattern_requirements: + min_digits: 1 # Require at least 1 digit (0-9) + min_uppercase: 1 # Require at least 1 uppercase letter (A-Z) + min_lowercase: 1 # Require at least 1 lowercase letter (a-z) + min_special_chars: 1 # Require at least 1 special character + special_chars: "!@#$%^&*" # Optional: define which characters are "special" +``` + +All fields are optional. If `special_chars` is not specified, the default set includes: `!@#$%^&*()_+-=[]{}|;:'",.<>?/\`~` + +### Example: Secure API Key + +```yaml +rules: + - name: Secure API Key + id: custom.secure_api.1 + pattern: | + (?xi) + api[_-]?key + (?:.|[\n\r]){0,32}? 
+ \b + ([A-Za-z0-9!@#$%^&*]{20,}) + \b + min_entropy: 4.0 + confidence: high + pattern_requirements: + min_digits: 1 # Must contain at least 1 digit + min_uppercase: 1 # Must contain at least 1 uppercase letter + min_lowercase: 1 # Must contain at least 1 lowercase letter + min_special_chars: 1 # Must contain at least 1 special character + examples: + - api_key = "MyS3cur3K3y!2024" + - api-key: "Abc123!@#Token" +``` + +In this example: +- The regex pattern is permissive: `[A-Za-z0-9!@#$%^&*]{20,}` matches any combination of those characters +- The `pattern_requirements` filters out matches that don't have at least one of each required type +- A match like `"abcdefghijklmnopqrst"` would be rejected (no uppercase, no digit, no special) +- A match like `"Abc123!SecureToken"` would be accepted (has all required types) + +### Example: Custom Special Characters + +```yaml +rules: + - name: Token with Custom Special Chars + id: custom.token.1 + pattern: | + (?xi) + token + (?:.|[\n\r]){0,16}? + \b([A-Za-z0-9$%^]{16,})\b + min_entropy: 3.5 + confidence: medium + pattern_requirements: + min_special_chars: 2 + special_chars: "$%^" # Only these characters count as "special" + examples: + - token = "abc$%defgh123456" +``` + +### How It Works + +1. Hyperscan regex matches a pattern in the input +2. Entropy check filters low-complexity matches (if `min_entropy` is set) +3. **Character requirements check filters matches that don't meet the criteria** +4. Validation checks verify the secret is live (if `validation` is configured) + +Matches that fail the character requirements check are silently dropped with a debug log message. 
+ ## Writing Custom Rules diff --git a/src/baseline.rs b/src/baseline.rs index 6f3dab5..68f5d93 100644 --- a/src/baseline.rs +++ b/src/baseline.rs @@ -147,6 +147,7 @@ mod tests { references: vec![], validation: None, depends_on_rule: vec![], + pattern_requirements: None, })) } diff --git a/src/matcher.rs b/src/matcher.rs index f93576a..7d73b76 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -602,6 +602,18 @@ fn filter_match<'b>( ); continue; } + + // Check character requirements if specified + if let Some(char_reqs) = rule.pattern_requirements() { + if !char_reqs.validate(mi_bytes) { + debug!( + "Skipping match that does not meet character requirements for rule {}", + rule.id() + ); + continue; + } + } + let matching_input_offset_span = OffsetSpan::from_range( (start + matching_input.start())..(start + matching_input.end()), ); @@ -1027,6 +1039,7 @@ mod test { references: vec![], validation: None::, // no HTTP validation needed depends_on_rule: vec![], + pattern_requirements: None, }); let rules_db = RulesDatabase::from_rules(vec![rule]).unwrap(); @@ -1098,6 +1111,7 @@ mod test { variable: "domain".to_string(), }), ], + pattern_requirements: None, })]; let rules_db = RulesDatabase::from_rules(rules)?; let input = "some test data for vectorscan"; @@ -1197,6 +1211,7 @@ mod test { references: vec![], validation: None::, depends_on_rule: vec![], + pattern_requirements: None, }); let rules_db = RulesDatabase::from_rules(vec![rule])?; @@ -1234,6 +1249,7 @@ mod test { references: vec![], validation: None::, depends_on_rule: vec![], + pattern_requirements: None, }); let rules_db = RulesDatabase::from_rules(vec![rule])?; let seen = BlobIdMap::new(); @@ -1266,6 +1282,7 @@ mod test { references: vec![], validation: None::, depends_on_rule: vec![], + pattern_requirements: None, }); let rules_db = RulesDatabase::from_rules(vec![rule])?; let seen = BlobIdMap::new(); @@ -1306,6 +1323,7 @@ line2 references: vec![], validation: None::, depends_on_rule: vec![], + 
pattern_requirements: None, }); let rules_db = RulesDatabase::from_rules(vec![rule])?; diff --git a/src/reporter.rs b/src/reporter.rs index 9ec0f66..bb96064 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -840,6 +840,7 @@ mod tests { references: vec![], validation: None, depends_on_rule: vec![], + pattern_requirements: None, })); let blob_id = BlobId::new(b"blob-data"); diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 26d1b27..11c5b30 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -196,6 +196,7 @@ mod tests { references: vec![], validation: None, depends_on_rule: vec![], + pattern_requirements: None, }; let rule = Arc::new(Rule::new(syntax)); Match { diff --git a/src/rules/rule.rs b/src/rules/rule.rs index dbf52ef..399bd73 100644 --- a/src/rules/rule.rs +++ b/src/rules/rule.rs @@ -52,6 +52,73 @@ pub struct DependsOnRule { pub variable: String, } +/// Specifies character type requirements for matched secrets. +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] +pub struct PatternRequirements { + /// Minimum number of digits required (0-9) + #[serde(default)] + pub min_digits: Option, + /// Minimum number of uppercase letters required (A-Z) + #[serde(default)] + pub min_uppercase: Option, + /// Minimum number of lowercase letters required (a-z) + #[serde(default)] + pub min_lowercase: Option, + /// Minimum number of special characters required + #[serde(default)] + pub min_special_chars: Option, + /// Custom set of characters to consider as "special" (defaults to common punctuation) + #[serde(default)] + pub special_chars: Option, +} + +impl PatternRequirements { + /// Default special characters if none are specified + const DEFAULT_SPECIAL_CHARS: &'static str = "!@#$%^&*()_+-=[]{}|;:'\",.<>?/\\`~"; + + /// Validates whether the given byte slice meets the character requirements. + /// Returns true if all requirements are met, false otherwise. 
+ pub fn validate(&self, input: &[u8]) -> bool { + // Convert to string (lossy for non-UTF8) + let s = String::from_utf8_lossy(input); + + // Check digit requirement + if let Some(min_digits) = self.min_digits { + let digit_count = s.chars().filter(|c| c.is_ascii_digit()).count(); + if digit_count < min_digits { + return false; + } + } + + // Check uppercase requirement + if let Some(min_uppercase) = self.min_uppercase { + let uppercase_count = s.chars().filter(|c| c.is_ascii_uppercase()).count(); + if uppercase_count < min_uppercase { + return false; + } + } + + // Check lowercase requirement + if let Some(min_lowercase) = self.min_lowercase { + let lowercase_count = s.chars().filter(|c| c.is_ascii_lowercase()).count(); + if lowercase_count < min_lowercase { + return false; + } + } + + // Check special character requirement + if let Some(min_special) = self.min_special_chars { + let special_chars = self.special_chars.as_deref().unwrap_or(Self::DEFAULT_SPECIAL_CHARS); + let special_count = s.chars().filter(|c| special_chars.contains(*c)).count(); + if special_count < min_special { + return false; + } + } + + true + } +} + /// Configuration for HTTP validation. This contains a request configuration /// and an optional multipart configuration. #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] @@ -283,6 +350,9 @@ pub struct RuleSyntax { /// Optional dependencies on other rules. #[serde(default)] pub depends_on_rule: Vec>, + /// Optional character type requirements for matched secrets. + #[serde(default)] + pub pattern_requirements: Option, } lazy_static! 
{ @@ -333,6 +403,7 @@ impl RuleSyntax { /// visible: true, /// validation: None, /// depends_on_rule: vec![], + /// pattern_requirements: None, /// }; /// assert_eq!(r.as_anchored_regex().unwrap().as_str(), r"hello\s*world$"); /// ``` @@ -436,4 +507,156 @@ impl Rule { pub fn confidence(&self) -> Confidence { self.syntax.confidence } + + /// Returns the character requirements for this rule, if any. + pub fn pattern_requirements(&self) -> Option<&PatternRequirements> { + self.syntax.pattern_requirements.as_ref() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_pattern_requirements_digits() { + let reqs = PatternRequirements { + min_digits: Some(2), + min_uppercase: None, + min_lowercase: None, + min_special_chars: None, + special_chars: None, + }; + + // Should pass: has 3 digits + assert!(reqs.validate(b"abc123def")); + + // Should fail: only 1 digit + assert!(!reqs.validate(b"abc1def")); + + // Should fail: no digits + assert!(!reqs.validate(b"abcdef")); + } + + #[test] + fn test_pattern_requirements_uppercase() { + let reqs = PatternRequirements { + min_digits: None, + min_uppercase: Some(2), + min_lowercase: None, + min_special_chars: None, + special_chars: None, + }; + + // Should pass: has 3 uppercase + assert!(reqs.validate(b"ABCdef")); + + // Should fail: only 1 uppercase + assert!(!reqs.validate(b"Adef")); + + // Should fail: no uppercase + assert!(!reqs.validate(b"abcdef")); + } + + #[test] + fn test_pattern_requirements_lowercase() { + let reqs = PatternRequirements { + min_digits: None, + min_uppercase: None, + min_lowercase: Some(2), + min_special_chars: None, + special_chars: None, + }; + + // Should pass: has 3 lowercase + assert!(reqs.validate(b"ABCdef")); + + // Should fail: only 1 lowercase + assert!(!reqs.validate(b"ABCd")); + + // Should fail: no lowercase + assert!(!reqs.validate(b"ABC123")); + } + + #[test] + fn test_pattern_requirements_special_chars() { + let reqs = PatternRequirements { + min_digits: None, + 
min_uppercase: None, + min_lowercase: None, + min_special_chars: Some(2), + special_chars: None, // uses default + }; + + // Should pass: has 2 special chars + assert!(reqs.validate(b"abc!@def")); + + // Should fail: only 1 special char + assert!(!reqs.validate(b"abc!def")); + + // Should fail: no special chars + assert!(!reqs.validate(b"abcdef")); + } + + #[test] + fn test_pattern_requirements_custom_special_chars() { + let reqs = PatternRequirements { + min_digits: None, + min_uppercase: None, + min_lowercase: None, + min_special_chars: Some(2), + special_chars: Some("$%^".to_string()), + }; + + // Should pass: has 2 custom special chars + assert!(reqs.validate(b"abc$%def")); + + // Should fail: has special chars but not the custom ones + assert!(!reqs.validate(b"abc!@def")); + + // Should fail: only 1 custom special char + assert!(!reqs.validate(b"abc$def")); + } + + #[test] + fn test_pattern_requirements_combined() { + let reqs = PatternRequirements { + min_digits: Some(1), + min_uppercase: Some(1), + min_lowercase: Some(1), + min_special_chars: Some(1), + special_chars: None, + }; + + // Should pass: has all requirements + assert!(reqs.validate(b"Abc1!")); + + // Should fail: missing digit + assert!(!reqs.validate(b"Abc!")); + + // Should fail: missing uppercase + assert!(!reqs.validate(b"abc1!")); + + // Should fail: missing lowercase + assert!(!reqs.validate(b"ABC1!")); + + // Should fail: missing special + assert!(!reqs.validate(b"Abc1")); + } + + #[test] + fn test_pattern_requirements_none() { + let reqs = PatternRequirements { + min_digits: None, + min_uppercase: None, + min_lowercase: None, + min_special_chars: None, + special_chars: None, + }; + + // Should pass: no requirements + assert!(reqs.validate(b"anything")); + assert!(reqs.validate(b"123")); + assert!(reqs.validate(b"!@#")); + } } diff --git a/testdata/test_char_requirements.yaml b/testdata/test_char_requirements.yaml new file mode 100644 index 0000000..f1f9801 --- /dev/null +++ 
b/testdata/test_char_requirements.yaml @@ -0,0 +1,24 @@ +rules: + - name: Secure API Key with Character Requirements + id: test.char_requirements.1 + pattern: | + (?xi) + api[_-]?key + (?:.|[\n\r]){0,32}? + \b + ([A-Za-z0-9!@#$%^&*]{16,}) + \b + min_entropy: 3.0 + confidence: high + pattern_requirements: + min_digits: 1 + min_uppercase: 1 + min_lowercase: 1 + min_special_chars: 1 + examples: + - api_key = "Abc123!SecureToken" + - api-key: "MyK3y!WithSpecial" + negative_examples: + - api_key = "abcdefghijklmnop" # no uppercase, no digit, no special + - api_key = "ABCDEFGHIJKLMNOP" # no lowercase, no digit, no special + - api_key = "abc123defghijklm" # no uppercase, no special diff --git a/tests/fingerprint_dedup.rs b/tests/fingerprint_dedup.rs index f458e7d..5771cc4 100644 --- a/tests/fingerprint_dedup.rs +++ b/tests/fingerprint_dedup.rs @@ -33,6 +33,7 @@ fn make_match(fp: u64) -> Match { references: vec![], validation: None, depends_on_rule: vec![], + pattern_requirements: None, }; let rule = Arc::new(Rule::new(syntax)); Match { From 8c9a0ad7711f8625c228f5edb56074277bca959f Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Tue, 4 Nov 2025 14:06:35 -0500 Subject: [PATCH 02/19] Automatically set --no-dedup whenever --manage-baseline is supplied so baseline management retains every occurrence of a finding --- CHANGELOG.md | 3 +++ README.md | 4 +++- src/cli/commands/scan.rs | 4 ++++ tests/smoke_baseline.rs | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 43 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8555b69..023cc33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,9 @@ All notable changes to this project will be documented in this file. ## [Unrelease] - pattern_requirements for rules — Post-regex character-class gating to cut false positives without lookarounds. Authors can now require minimum counts of digits, uppercase, lowercase, and special characters, with an optional custom special-char set. Why? 
Hyperscan doesn’t support lookaheads/behinds, so many "must contain X and Y" checks had to be baked into the regex (hurting readability) or were impossible. `pattern_requirements` applies lightweight, in-memory checks after a match is found, keeping patterns fast and clean. +- updated rules with support for `pattern_requirements` +- Automatically set `--no-dedup` whenever `--manage-baseline` is supplied so baseline management retains every occurrence of a finding + ## [v1.61.0] - Fixed local filesystem scans to keep `open_path_as_is` enabled when opening Git repositories and only disable it for diff-based scans. - Created Linux and Windows specific installer script diff --git a/README.md b/README.md index 29a11e8..5924c55 100644 --- a/README.md +++ b/README.md @@ -1083,6 +1083,8 @@ kingfisher scan /path/to/code \ --baseline-file ./baseline-file.yml ``` +`--manage-baseline` automatically enables `--no-dedup` so the baseline captures every individual occurrence. + Use the same YAML file with the `--baseline-file` option on future scans to hide all recorded findings: ```bash @@ -1159,7 +1161,7 @@ leaves the default unchanged. 
- `--redact`: Replaces discovered secrets with a one-way hash for secure output - `--exclude `: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) - `--baseline-file `: Ignore matches listed in a baseline YAML file -- `--manage-baseline`: Create or update the baseline file with current findings +- `--manage-baseline`: Create or update the baseline file with current findings (automatically enables `--no-dedup`) - `--skip-regex `: Ignore findings whose text matches this regex (repeatable) - `--skip-word `: Ignore findings containing this case-insensitive word (repeatable) - `--skip-aws-account `: Skip live AWS validation for findings tied to the specified AWS account number (repeatable, accepts comma-separated lists) diff --git a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs index 8deb595..1f0aaf5 100644 --- a/src/cli/commands/scan.rs +++ b/src/cli/commands/scan.rs @@ -416,6 +416,10 @@ impl ScanCommandArgs { self.scan_args.input_specifier_args.emit_deprecated_warnings(); } + if self.scan_args.manage_baseline { + self.scan_args.no_dedup = true; + } + Ok(ScanOperation::Scan(self.scan_args)) } } diff --git a/tests/smoke_baseline.rs b/tests/smoke_baseline.rs index f69be7c..d6dbf5b 100644 --- a/tests/smoke_baseline.rs +++ b/tests/smoke_baseline.rs @@ -6,6 +6,39 @@ use tempfile::tempdir; const GH_PAT: &str = "ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa"; +#[test] +fn manage_baseline_enables_no_dedup() -> anyhow::Result<()> { + use kingfisher::cli::{ + commands::scan::ScanOperation, + global::{Command, CommandLineArgs}, + }; + + let dir = tempdir()?; + + let args = CommandLineArgs::try_parse_from([ + "kingfisher", + "scan", + dir.path().to_str().unwrap(), + "--manage-baseline", + "--no-update-check", + ])?; + + let command = match args.command { + Command::Scan(scan_args) => scan_args, + other => panic!("unexpected command parsed: {:?}", other), + }; + + let scan_args = match command.into_operation()? 
{ + ScanOperation::Scan(scan_args) => scan_args, + op => panic!("expected scan operation, got {:?}", op), + }; + + assert!(scan_args.manage_baseline); + assert!(scan_args.no_dedup); + + Ok(()) +} + #[test] fn baseline_create_and_filter() -> anyhow::Result<()> { let dir = tempdir()?; From 3b3a4e50307dee4d72f1497f57d0865cb0902f89 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Tue, 4 Nov 2025 14:15:04 -0500 Subject: [PATCH 03/19] Added an optional exclude_words list to PatternRequirements so matches containing case-insensitive placeholder words are filtered out, with accompanying tests to cover the new behavior. --- CHANGELOG.md | 5 ++-- docs/RULES.md | 32 ++++++++++++++++++++-- src/matcher.rs | 49 ++++++++++++++++++++++++++++++++- src/rules/rule.rs | 69 ++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 149 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 023cc33..817672e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,10 +2,11 @@ All notable changes to this project will be documented in this file. -## [Unrelease] +## [Unreleased] - pattern_requirements for rules — Post-regex character-class gating to cut false positives without lookarounds. Authors can now require minimum counts of digits, uppercase, lowercase, and special characters, with an optional custom special-char set. Why? Hyperscan doesn’t support lookaheads/behinds, so many "must contain X and Y" checks had to be baked into the regex (hurting readability) or were impossible. `pattern_requirements` applies lightweight, in-memory checks after a match is found, keeping patterns fast and clean. 
-- updated rules with support for `pattern_requirements` +- Updated many rules with `pattern_requirements` - Automatically set `--no-dedup` whenever `--manage-baseline` is supplied so baseline management retains every occurrence of a finding +- Added an optional `exclude_words` list to `PatternRequirements` so matches containing case-insensitive placeholder words are filtered out, with accompanying tests to cover the new behavior. ## [v1.61.0] - Fixed local filesystem scans to keep `open_path_as_is` enabled when opening Git repositories and only disable it for diff-based scans. diff --git a/docs/RULES.md b/docs/RULES.md index 3b3e65a..816dbe7 100644 --- a/docs/RULES.md +++ b/docs/RULES.md @@ -38,12 +38,14 @@ rules: - rule_id: kingfisher.aws.id variable: AKID # referenced as {{ AKID }} - pattern_requirements: # (optional) character type requirements + pattern_requirements: # (optional) character/word requirements min_digits: 1 # require at least 1 digit min_uppercase: 1 # require at least 1 uppercase letter min_lowercase: 1 # require at least 1 lowercase letter min_special_chars: 1 # require at least 1 special character special_chars: "!@#$%^&*()" # optional: custom special character set + exclude_words: # optional: drop matches containing these words + - test validation: # (optional) live validation type: Http @@ -78,7 +80,7 @@ rules: | examples | Good matches; used for testing | | visible | false to hide non‑secret captures (e.g. IDs) | | depends_on_rule | Chain rules: use captures from one rule in another's validation | -| pattern_requirements | Require specific character types (digits, uppercase, lowercase, special) | +| pattern_requirements | Require character types and/or exclude placeholder words from matches | | validation | Configure HTTP, AWS, GCP, etc. 
checks to verify live validity | @@ -264,10 +266,15 @@ pattern_requirements: min_lowercase: 1 # Require at least 1 lowercase letter (a-z) min_special_chars: 1 # Require at least 1 special character special_chars: "!@#$%^&*" # Optional: define which characters are "special" + exclude_words: # Optional: reject matches containing any of these (case-insensitive) + - test + - demo ``` All fields are optional. If `special_chars` is not specified, the default set includes: `!@#$%^&*()_+-=[]{}|;:'",.<>?/\`~` +`exclude_words` performs a case-insensitive substring check. If any entry (after trimming whitespace) appears within the match, the match is discarded. This is helpful for dropping known dummy tokens such as "test" or "demo" that otherwise satisfy the regex. + ### Example: Secure API Key ```yaml @@ -288,6 +295,8 @@ rules: min_uppercase: 1 # Must contain at least 1 uppercase letter min_lowercase: 1 # Must contain at least 1 lowercase letter min_special_chars: 1 # Must contain at least 1 special character + exclude_words: + - test examples: - api_key = "MyS3cur3K3y!2024" - api-key: "Abc123!@#Token" @@ -298,6 +307,25 @@ In this example: - The `pattern_requirements` filters out matches that don't have at least one of each required type - A match like `"abcdefghijklmnopqrst"` would be rejected (no uppercase, no digit, no special) - A match like `"Abc123!SecureToken"` would be accepted (has all required types) +- A match like `"Test123!SecureToken"` would be rejected because it contains the excluded word `test` + +### Example: Excluding Dummy Values + +```yaml +rules: + - name: Token without placeholders + id: custom.token.2 + pattern: |- + (?i)token[:=]\s*([A-Za-z0-9]{12,}) + pattern_requirements: + exclude_words: + - placeholder + - sample + examples: + - token: "REALVALUE1234" + negative_examples: + - token = "SAMPLETOKEN9999" # dropped by exclude_words +``` ### Example: Custom Special Characters diff --git a/src/matcher.rs b/src/matcher.rs index 7d73b76..88b601b 100644 
--- a/src/matcher.rs +++ b/src/matcher.rs @@ -1004,7 +1004,9 @@ mod test { use crate::{ blob::{Blob, BlobIdMap}, origin::{Origin, OriginSet}, - rules::rule::{DependsOnRule, HttpRequest, HttpValidation, RuleSyntax, Validation}, + rules::rule::{ + DependsOnRule, HttpRequest, HttpValidation, PatternRequirements, RuleSyntax, Validation, + }, }; proptest! { @@ -1138,6 +1140,51 @@ mod test { Ok(()) } + #[test] + fn test_pattern_requirements_exclude_words_filters_matches() -> Result<()> { + let rules = vec![Rule::new(RuleSyntax { + id: "test.exclude".to_string(), + name: "exclude words".to_string(), + pattern: "(?Pprefix[A-Za-z]+)".to_string(), + confidence: crate::rules::rule::Confidence::Medium, + min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None, + depends_on_rule: vec![], + pattern_requirements: Some(PatternRequirements { + min_digits: None, + min_uppercase: None, + min_lowercase: None, + min_special_chars: None, + special_chars: None, + exclude_words: Some(vec!["TEST".to_string()]), + }), + })]; + + let rules_db = RulesDatabase::from_rules(rules)?; + let input = b"prefixgood prefixtest"; + let seen_blobs: BlobIdMap = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = + Matcher::new(&rules_db, scanner_pool, &seen_blobs, None, false, None, &[], false)?; + + matcher.scan_bytes_raw(input, "fname")?; + + let matches = &matcher.user_data.raw_matches_scratch; + assert_eq!(matches.len(), 1, "exclude_words should drop filtered matches"); + let RawMatch { start_idx, end_idx, .. 
} = matches[0]; + assert_eq!( + &input[start_idx as usize..end_idx as usize], + b"prefixgood", + "remaining match should be the non-excluded token", + ); + + Ok(()) + } + // --------------------------------------------------------------------- // additional deterministic unit-tests // --------------------------------------------------------------------- diff --git a/src/rules/rule.rs b/src/rules/rule.rs index 399bd73..051adc3 100644 --- a/src/rules/rule.rs +++ b/src/rules/rule.rs @@ -70,6 +70,9 @@ pub struct PatternRequirements { /// Custom set of characters to consider as "special" (defaults to common punctuation) #[serde(default)] pub special_chars: Option, + /// Words that should cause the match to be excluded when present (case-insensitive) + #[serde(default)] + pub exclude_words: Option>, } impl PatternRequirements { @@ -108,13 +111,33 @@ impl PatternRequirements { // Check special character requirement if let Some(min_special) = self.min_special_chars { - let special_chars = self.special_chars.as_deref().unwrap_or(Self::DEFAULT_SPECIAL_CHARS); + let special_chars = + self.special_chars.as_deref().unwrap_or(Self::DEFAULT_SPECIAL_CHARS); let special_count = s.chars().filter(|c| special_chars.contains(*c)).count(); if special_count < min_special { return false; } } + // Check exclude words requirement + if let Some(exclude_words) = self.exclude_words.as_ref() { + let lowercase_input = s.to_lowercase(); + if exclude_words + .iter() + .filter_map(|word| { + let trimmed = word.trim(); + if trimmed.is_empty() { + None + } else { + Some(trimmed.to_lowercase()) + } + }) + .any(|word| lowercase_input.contains(&word)) + { + return false; + } + } + true } } @@ -526,6 +549,7 @@ mod tests { min_lowercase: None, min_special_chars: None, special_chars: None, + exclude_words: None, }; // Should pass: has 3 digits @@ -546,6 +570,7 @@ mod tests { min_lowercase: None, min_special_chars: None, special_chars: None, + exclude_words: None, }; // Should pass: has 3 uppercase @@ 
-566,6 +591,7 @@ mod tests { min_lowercase: Some(2), min_special_chars: None, special_chars: None, + exclude_words: None, }; // Should pass: has 3 lowercase @@ -586,6 +612,7 @@ mod tests { min_lowercase: None, min_special_chars: Some(2), special_chars: None, // uses default + exclude_words: None, }; // Should pass: has 2 special chars @@ -606,6 +633,7 @@ mod tests { min_lowercase: None, min_special_chars: Some(2), special_chars: Some("$%^".to_string()), + exclude_words: None, }; // Should pass: has 2 custom special chars @@ -626,6 +654,7 @@ mod tests { min_lowercase: Some(1), min_special_chars: Some(1), special_chars: None, + exclude_words: None, }; // Should pass: has all requirements @@ -644,6 +673,43 @@ mod tests { assert!(!reqs.validate(b"Abc1")); } + #[test] + fn test_pattern_requirements_exclude_words() { + let reqs = PatternRequirements { + min_digits: None, + min_uppercase: None, + min_lowercase: None, + min_special_chars: None, + special_chars: None, + exclude_words: Some(vec!["test".to_string(), "Demo".to_string()]), + }; + + // Should fail: contains "test" (case-insensitive) + assert!(!reqs.validate(b"MyTestToken")); + + // Should fail: contains "demo" (case-insensitive) + assert!(!reqs.validate(b"example-demo-value")); + + // Should pass: does not contain excluded words + assert!(reqs.validate(b"example-value")); + } + + #[test] + fn test_pattern_requirements_exclude_words_ignores_empty_entries() { + let reqs = PatternRequirements { + min_digits: None, + min_uppercase: None, + min_lowercase: None, + min_special_chars: None, + special_chars: None, + exclude_words: Some(vec![" ".to_string(), "".to_string(), "BLOCK".to_string()]), + }; + + // Should fail only when non-empty exclusion matches + assert!(!reqs.validate(b"needs-blocking")); + assert!(reqs.validate(b"allowed")); + } + #[test] fn test_pattern_requirements_none() { let reqs = PatternRequirements { @@ -652,6 +718,7 @@ mod tests { min_lowercase: None, min_special_chars: None, special_chars: None, + 
exclude_words: None, }; // Should pass: no requirements From c89841a958ae5588edbede547857216d01ee3028 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Tue, 4 Nov 2025 14:16:23 -0500 Subject: [PATCH 04/19] Added an optional exclude_words list to PatternRequirements so matches containing case-insensitive placeholder words are filtered out, with accompanying tests to cover the new behavior. --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 817672e..4337324 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,10 +3,10 @@ All notable changes to this project will be documented in this file. ## [Unreleased] -- pattern_requirements for rules — Post-regex character-class gating to cut false positives without lookarounds. Authors can now require minimum counts of digits, uppercase, lowercase, and special characters, with an optional custom special-char set. Why? Hyperscan doesn’t support lookaheads/behinds, so many "must contain X and Y" checks had to be baked into the regex (hurting readability) or were impossible. `pattern_requirements` applies lightweight, in-memory checks after a match is found, keeping patterns fast and clean. +- Added `pattern_requirements` for rules. Enables post-regex character-class checks (digits, uppercase, lowercase, specials) to reduce false positives without lookarounds. Provides lightweight, in-memory validation after matches, keeping patterns fast and readable. +- Added an optional `exclude_words` list to `PatternRequirements` so matches containing case-insensitive placeholder words are filtered out, with accompanying tests to cover the new behavior. 
- Updated many rules with `pattern_requirements` - Automatically set `--no-dedup` whenever `--manage-baseline` is supplied so baseline management retains every occurrence of a finding -- Added an optional `exclude_words` list to `PatternRequirements` so matches containing case-insensitive placeholder words are filtered out, with accompanying tests to cover the new behavior. ## [v1.61.0] - Fixed local filesystem scans to keep `open_path_as_is` enabled when opening Git repositories and only disable it for diff-based scans. From 046ac6a0520d15d9b95fa54d76720a104fdac341 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Tue, 4 Nov 2025 13:07:24 -0800 Subject: [PATCH 05/19] Added an optional exclude_words list to PatternRequirements so matches containing case-insensitive placeholder words are filtered out, with accompanying tests to cover the new behavior. --- data/rules/mongodb.yml | 4 + src/matcher.rs | 18 ++- tests/cli.rs | 9 +- tests/cli_failure.rs | 9 +- tests/cli_subcommands.rs | 186 ++++++++--------------- tests/int_base64.rs | 8 +- tests/int_quiet.rs | 6 +- tests/int_rules_no_validated_findings.rs | 2 +- tests/smoke_archive.rs | 4 +- tests/smoke_baseline.rs | 11 +- tests/smoke_branch.rs | 10 +- tests/smoke_check_rules.rs | 2 +- tests/smoke_docker.rs | 2 +- tests/smoke_exclude.rs | 4 +- tests/smoke_fs.rs | 2 +- tests/smoke_git.rs | 2 +- tests/smoke_github_homebrew.rs | 2 +- 17 files changed, 113 insertions(+), 168 deletions(-) diff --git a/data/rules/mongodb.yml b/data/rules/mongodb.yml index 28b4031..f6696c3 100644 --- a/data/rules/mongodb.yml +++ b/data/rules/mongodb.yml @@ -81,6 +81,10 @@ rules: mongodb(?:\+srv)?://[\S]{3,50}:(?:[\S]{3,88})@[-.%\w/:]+ ) \b + pattern_requirements: + exclude_words: + - "@localhost" + - "@127.0.0.1" min_entropy: 3 examples: - client = mongoc_client_new ("mongodb+srv://someuser:hunter2@my-atlas-rd941.mongodb.net/test?retryWrites=true&w=majority"); diff --git a/src/matcher.rs b/src/matcher.rs index 88b601b..e572c8b 100644 --- 
a/src/matcher.rs +++ b/src/matcher.rs @@ -1171,13 +1171,22 @@ mod test { let mut matcher = Matcher::new(&rules_db, scanner_pool, &seen_blobs, None, false, None, &[], false)?; - matcher.scan_bytes_raw(input, "fname")?; + let blob = Blob::from_bytes(input.to_vec()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("exclude.txt"))); + + let matches = match matcher.scan_blob(&blob, &origin, None, false, false, false)? { + ScanResult::New(matches) => matches, + ScanResult::SeenWithMatches => { + panic!("unexpected scan result: blob should not be considered previously seen with matches") + } + ScanResult::SeenSansMatches => { + panic!("unexpected scan result: blob should not be considered previously seen without matches") + } + }; - let matches = &matcher.user_data.raw_matches_scratch; assert_eq!(matches.len(), 1, "exclude_words should drop filtered matches"); - let RawMatch { start_idx, end_idx, .. } = matches[0]; assert_eq!( - &input[start_idx as usize..end_idx as usize], + matches[0].matching_input, b"prefixgood", "remaining match should be the non-excluded token", ); @@ -1185,6 +1194,7 @@ mod test { Ok(()) } + // --------------------------------------------------------------------- // additional deterministic unit-tests // --------------------------------------------------------------------- diff --git a/tests/cli.rs b/tests/cli.rs index 0486b73..ee845cd 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -6,8 +6,7 @@ mod test { use super::*; #[test] fn cli_lists_rules_pretty() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["rules", "list", "--format", "pretty", "--no-update-check"]) .assert() .success() @@ -15,8 +14,7 @@ mod test { } #[test] fn cli_lists_rules_json() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["rules", "list", "--format", "json", "--no-update-check"]) .assert() .success() @@ -25,8 +23,7 @@ mod test { 
#[test] fn cli_version_flag() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .arg("--version") .assert() .success() diff --git a/tests/cli_failure.rs b/tests/cli_failure.rs index ab352f6..0968f4f 100644 --- a/tests/cli_failure.rs +++ b/tests/cli_failure.rs @@ -8,8 +8,7 @@ use tempfile::TempDir; /// 1. Path-does-not-exist ⇒ run_async_scan bails with “Invalid input” #[test] fn scan_fails_for_missing_path() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "no/such/path/here", "--no-update-check"]) .assert() .failure() // exit-code ≠ 0 @@ -22,8 +21,7 @@ fn scan_fails_for_bad_rule_yaml() { let tmp = TempDir::new().unwrap(); fs::write(tmp.path().join("broken.yml"), "this: is: : not yaml").unwrap(); - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", tmp.path().to_str().unwrap(), // dummy input dir (exists) @@ -68,8 +66,7 @@ rules: // Create a dummy input file that matches the rule fs::write(tmp.path().join("input.txt"), "dummy_dead").unwrap(); - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", tmp.path().join("input.txt").to_str().unwrap(), diff --git a/tests/cli_subcommands.rs b/tests/cli_subcommands.rs index 2be09ab..b51fc29 100644 --- a/tests/cli_subcommands.rs +++ b/tests/cli_subcommands.rs @@ -19,8 +19,7 @@ mod github { #[test] fn scan_github_help() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "github", "--help"]) .assert() .success() @@ -29,8 +28,7 @@ mod github { #[test] fn scan_github_list_only_help() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "github", "--help"]) .assert() .success() @@ -42,8 +40,7 @@ mod github { #[test] fn 
scan_github_requires_specifier() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "github", "--no-update-check"]) .assert() .failure() @@ -52,8 +49,7 @@ mod github { #[test] fn scan_github_with_user() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "github", "--user", "testuser", "--list-only", "--no-update-check"]) .assert() .code(predicates::function::function(|code: &i32| { @@ -64,8 +60,7 @@ mod github { #[test] fn scan_github_with_organization() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "github", @@ -80,8 +75,7 @@ mod github { #[test] fn scan_github_multiple_users() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "github", @@ -98,8 +92,7 @@ mod github { #[test] fn scan_github_with_exclude() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "github", @@ -116,8 +109,7 @@ mod github { #[test] fn scan_github_with_repo_type_fork() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "github", @@ -134,8 +126,7 @@ mod github { #[test] fn scan_github_with_repo_type_source() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "github", @@ -152,8 +143,7 @@ mod github { #[test] fn scan_github_custom_api_url() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "github", @@ -170,8 +160,7 @@ mod github { #[test] fn scan_github_all_organizations() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "github", @@ 
-187,8 +176,7 @@ mod github { #[test] fn scan_github_invalid_repo_type() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "github", @@ -205,8 +193,7 @@ mod github { #[test] fn scan_github_mixed_user_and_org() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "github", @@ -231,8 +218,7 @@ mod gitlab { #[test] fn scan_gitlab_help() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "gitlab", "--help"]) .assert() .success() @@ -241,8 +227,7 @@ mod gitlab { #[test] fn scan_gitlab_list_only_flag() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "gitlab", "--help"]) .assert() .success() @@ -251,8 +236,7 @@ mod gitlab { #[test] fn scan_gitlab_requires_specifier() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "gitlab", "--no-update-check"]) .assert() .failure() @@ -261,8 +245,7 @@ mod gitlab { #[test] fn scan_gitlab_with_user() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "gitlab", "--user", "testuser", "--list-only", "--no-update-check"]) .assert() .code(predicates::function::function(|code: &i32| *code == 0 || *code == 1)); @@ -270,8 +253,7 @@ mod gitlab { #[test] fn scan_gitlab_with_group() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "gitlab", "--group", "testgroup", "--list-only", "--no-update-check"]) .assert() .code(predicates::function::function(|code: &i32| *code == 0 || *code == 1)); @@ -279,8 +261,7 @@ mod gitlab { #[test] fn scan_gitlab_with_include_subgroups() { - Command::cargo_bin("kingfisher") - .unwrap() + 
Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "gitlab", @@ -296,8 +277,7 @@ mod gitlab { #[test] fn scan_gitlab_with_repo_type() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "gitlab", @@ -314,8 +294,7 @@ mod gitlab { #[test] fn scan_gitlab_with_exclude() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "gitlab", @@ -332,8 +311,7 @@ mod gitlab { #[test] fn scan_gitlab_custom_api_url() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "gitlab", @@ -350,8 +328,7 @@ mod gitlab { #[test] fn scan_gitlab_all_groups() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "gitlab", @@ -375,8 +352,7 @@ mod azure { #[test] fn scan_azure_help() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "azure", "--help"]) .assert() .success() @@ -385,8 +361,7 @@ mod azure { #[test] fn scan_azure_list_only_flag() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "azure", "--help"]) .assert() .success() @@ -395,8 +370,7 @@ mod azure { #[test] fn scan_azure_requires_specifier() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "azure", "--no-update-check"]) .assert() .failure() @@ -405,8 +379,7 @@ mod azure { #[test] fn scan_azure_with_organization() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "azure", @@ -421,8 +394,7 @@ mod azure { #[test] fn scan_azure_with_project() { - Command::cargo_bin("kingfisher") - .unwrap() + 
Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "azure", @@ -437,8 +409,7 @@ mod azure { #[test] fn scan_azure_all_projects() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "azure", @@ -454,8 +425,7 @@ mod azure { #[test] fn scan_azure_with_exclude() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "azure", @@ -472,8 +442,7 @@ mod azure { #[test] fn scan_azure_with_repo_type() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "azure", @@ -498,8 +467,7 @@ mod bitbucket { #[test] fn scan_bitbucket_help() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "bitbucket", "--help"]) .assert() .stdout(is_match(r"kingfisher(\.exe)? scan bitbucket \[OPTIONS\]").unwrap()); @@ -507,8 +475,7 @@ mod bitbucket { #[test] fn scan_bitbucket_list_only_flag() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "bitbucket", "--help"]) .assert() .success() @@ -517,8 +484,7 @@ mod bitbucket { #[test] fn scan_bitbucket_requires_specifier() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "bitbucket", "--no-update-check"]) .assert() .failure() @@ -527,8 +493,7 @@ mod bitbucket { #[test] fn scan_bitbucket_with_workspace() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "bitbucket", @@ -543,8 +508,7 @@ mod bitbucket { #[test] fn scan_bitbucket_with_user() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "bitbucket", "--user", "testuser", "--list-only", "--no-update-check"]) 
.assert() .code(predicates::function::function(|code: &i32| *code == 0 || *code == 1)); @@ -552,8 +516,7 @@ mod bitbucket { #[test] fn scan_bitbucket_with_project() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "bitbucket", @@ -568,8 +531,7 @@ mod bitbucket { #[test] fn scan_bitbucket_all_workspaces() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "bitbucket", @@ -585,8 +547,7 @@ mod bitbucket { #[test] fn scan_bitbucket_with_exclude() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "bitbucket", @@ -603,8 +564,7 @@ mod bitbucket { #[test] fn scan_bitbucket_with_repo_type() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "bitbucket", @@ -629,8 +589,7 @@ mod gitea { #[test] fn scan_gitea_help() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "gitea", "--help"]) .assert() .success() @@ -639,8 +598,7 @@ mod gitea { #[test] fn scan_gitea_requires_specifier() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "gitea", "--no-update-check"]) .assert() .failure() @@ -649,8 +607,7 @@ mod gitea { #[test] fn scan_gitea_with_organization() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "gitea", @@ -665,8 +622,7 @@ mod gitea { #[test] fn scan_gitea_with_user() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "gitea", "--user", "testuser", "--list-only", "--no-update-check"]) .assert() .code(predicates::function::function(|code: &i32| *code == 0 || *code == 1)); @@ -674,8 +630,7 
@@ mod gitea { #[test] fn scan_gitea_all_organizations() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "gitea", @@ -689,8 +644,7 @@ mod gitea { #[test] fn scan_gitea_with_exclude() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "gitea", @@ -707,8 +661,7 @@ mod gitea { #[test] fn scan_gitea_with_repo_type() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "gitea", @@ -725,8 +678,7 @@ mod gitea { #[test] fn scan_gitea_custom_api_url() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "gitea", @@ -751,8 +703,7 @@ mod huggingface { #[test] fn scan_huggingface_help() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "huggingface", "--help"]) .assert() .success() @@ -761,8 +712,7 @@ mod huggingface { #[test] fn scan_huggingface_requires_specifier() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "huggingface", "--no-update-check"]) .assert() .failure() @@ -771,8 +721,7 @@ mod huggingface { #[test] fn scan_huggingface_with_user() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "huggingface", @@ -787,8 +736,7 @@ mod huggingface { #[test] fn scan_huggingface_with_organization() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "huggingface", @@ -803,8 +751,7 @@ mod huggingface { #[test] fn scan_huggingface_with_model() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "huggingface", @@ -819,8 +766,7 @@ 
mod huggingface { #[test] fn scan_huggingface_with_dataset() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "huggingface", @@ -835,8 +781,7 @@ mod huggingface { #[test] fn scan_huggingface_with_space() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "huggingface", @@ -851,8 +796,7 @@ mod huggingface { #[test] fn scan_huggingface_with_exclude() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", "huggingface", @@ -888,8 +832,7 @@ mod cross_platform { for mut platform_args in platforms { platform_args.extend_from_slice(&["--list-only", "--no-update-check"]); - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(&platform_args) .assert() .code(predicates::function::function(|code: &i32| *code == 0 || *code == 1)); @@ -898,8 +841,7 @@ mod cross_platform { #[test] fn scan_requires_subcommand_or_path() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "--no-update-check"]) .assert() .failure() @@ -908,8 +850,7 @@ mod cross_platform { #[test] fn scan_invalid_platform_subcommand() { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "invalid-platform", "--no-update-check"]) .assert() .failure() @@ -919,8 +860,7 @@ mod cross_platform { #[test] fn scan_github_without_scanning_no_paths() { // list-only should work without providing actual scan paths - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "github", "--user", "testuser", "--list-only", "--no-update-check"]) .assert() .code(predicates::function::function(|code: &i32| *code == 0 || *code == 1)); @@ -941,8 +881,7 @@ mod 
legacy_compatibility { let root = PathBuf::from(env!("CARGO_MANIFEST_DIR")); let test_file = root.join("testdata").join("generic_secrets.py"); - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", test_file.to_str().expect("REASON"), @@ -962,8 +901,7 @@ mod legacy_compatibility { // This just validates the CLI parsing works // New syntax - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "github", "--user", "test", "--list-only", "--no-update-check"]) .assert() .code(predicates::function::function(|code: &i32| *code == 0 || *code == 1)); diff --git a/tests/int_base64.rs b/tests/int_base64.rs index c580451..be06570 100644 --- a/tests/int_base64.rs +++ b/tests/int_base64.rs @@ -12,7 +12,7 @@ fn detects_base64_encoded_secret() -> anyhow::Result<()> { let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDJxTHFQYQ=="; fs::write(&file_path, encoded)?; - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", dir.path().to_str().unwrap(), @@ -42,7 +42,7 @@ fn skips_base64_when_disabled() -> anyhow::Result<()> { let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDJxTHFQYQ=="; fs::write(&file_path, encoded)?; - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", dir.path().to_str().unwrap(), @@ -68,7 +68,7 @@ fn no_base64_skips_empty_files() -> anyhow::Result<()> { let file_path = dir.path().join("empty.py"); fs::write(&file_path, "")?; - Command::cargo_bin("kingfisher")? 
+ Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", dir.path().to_str().unwrap(), @@ -96,7 +96,7 @@ fn detects_base64_in_code_with_tree_sitter() -> anyhow::Result<()> { let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDJxTHFQYQ=="; fs::write(&file_path, format!("token = \"{}\"\n", encoded))?; - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", dir.path().to_str().unwrap(), diff --git a/tests/int_quiet.rs b/tests/int_quiet.rs index 1968972..9b3c403 100644 --- a/tests/int_quiet.rs +++ b/tests/int_quiet.rs @@ -10,8 +10,7 @@ fn contains_bytes(haystack: &[u8], needle: &[u8]) -> bool { #[test] fn scan_quiet_suppresses_summary() { for format in FORMATS { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .env("NO_COLOR", "1") .args([ "scan", @@ -35,8 +34,7 @@ fn scan_quiet_suppresses_summary() { #[test] fn scan_quiet_with_rule_stats_prints_rule_stats() { for format in FORMATS { - Command::cargo_bin("kingfisher") - .unwrap() + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .env("NO_COLOR", "1") .args([ "scan", diff --git a/tests/int_rules_no_validated_findings.rs b/tests/int_rules_no_validated_findings.rs index bd0f4f7..a8531c2 100644 --- a/tests/int_rules_no_validated_findings.rs +++ b/tests/int_rules_no_validated_findings.rs @@ -4,7 +4,7 @@ use serde_json::Value; #[test] fn scan_rules_has_no_validated_findings() -> Result<()> { - let output = Command::cargo_bin("kingfisher")? 
+ let output = Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "data/rules", "--format", "json", "--no-update-check", "--only-valid"]) .output()?; diff --git a/tests/smoke_archive.rs b/tests/smoke_archive.rs index c2afc7f..1ee547e 100644 --- a/tests/smoke_archive.rs +++ b/tests/smoke_archive.rs @@ -29,7 +29,7 @@ fn smoke_scan_tar_gz_archive() -> anyhow::Result<()> { let findings_code = 200; // ── 1) extraction ENABLED -- secret should be found ───────────────────────── - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", tar_gz.to_str().unwrap(), @@ -43,7 +43,7 @@ fn smoke_scan_tar_gz_archive() -> anyhow::Result<()> { .stdout(predicates::str::contains(github_pat)); // ── 2) extraction DISABLED -- secret *not* found ──────────────────────────── - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", tar_gz.to_str().unwrap(), diff --git a/tests/smoke_baseline.rs b/tests/smoke_baseline.rs index d6dbf5b..5c36797 100644 --- a/tests/smoke_baseline.rs +++ b/tests/smoke_baseline.rs @@ -3,6 +3,7 @@ use std::fs; use assert_cmd::Command; use predicates::prelude::*; use tempfile::tempdir; +use clap::Parser; const GH_PAT: &str = "ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa"; @@ -47,7 +48,7 @@ fn baseline_create_and_filter() -> anyhow::Result<()> { let baseline = dir.path().join("baseline.yaml"); // Create baseline with manage flag - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", dir.path().to_str().unwrap(), @@ -72,7 +73,7 @@ fn baseline_create_and_filter() -> anyhow::Result<()> { // Scanning with the baseline should suppress the existing finding and leave // the baseline untouched. - Command::cargo_bin("kingfisher")? 
+ Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", dir.path().to_str().unwrap(), @@ -94,7 +95,7 @@ fn baseline_create_and_filter() -> anyhow::Result<()> { assert_eq!(initial_baseline, baseline_after_scan, "baseline remains stable after reuse"); // Managing the baseline again should not churn entries or report the secret - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", dir.path().to_str().unwrap(), @@ -129,7 +130,7 @@ fn baseline_exclude_prunes_entries() -> anyhow::Result<()> { let baseline = dir.path().join("baseline.yaml"); // Initial baseline includes the .git secret - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", dir.path().to_str().unwrap(), @@ -150,7 +151,7 @@ fn baseline_exclude_prunes_entries() -> anyhow::Result<()> { assert!(content.contains(".git/secret.txt")); // Rescan with exclusion, which should prune the .git entry - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", dir.path().to_str().unwrap(), diff --git a/tests/smoke_branch.rs b/tests/smoke_branch.rs index 96b9d23..27f4d26 100644 --- a/tests/smoke_branch.rs +++ b/tests/smoke_branch.rs @@ -94,7 +94,7 @@ aws_secret_access_key = efnegoUp/WXc3XwlL77dXu1aKIICzvz+n+7Sz88i )?; // ── scan the repository by commit hash ─────────────────────────────────── - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", repo_dir.to_str().unwrap(), @@ -112,7 +112,7 @@ aws_secret_access_key = efnegoUp/WXc3XwlL77dXu1aKIICzvz+n+7Sz88i ); // ── scan only the diff between feature-1 and the merge base ───────────── - Command::cargo_bin("kingfisher")? 
+ Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", repo_dir.to_str().unwrap(), @@ -196,7 +196,7 @@ fn scan_specific_commit_reports_only_that_commit() -> Result<()> { let c1_hex = commits[0].to_string(); // first commit (AWS only) // Scan exactly the initial commit via --branch - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", repo_dir.to_str().unwrap(), @@ -225,7 +225,7 @@ fn scan_with_branch_root_includes_descendants() -> Result<()> { let c1_hex = commits[0].to_string(); // start from first commit // Using --branch-root should include the selected commit and remaining history up to HEAD - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", repo_dir.to_str().unwrap(), @@ -256,7 +256,7 @@ fn scan_branch_tip_with_branch_root_commit() -> Result<()> { // Passing --branch-root-commit should implicitly enable inclusive scanning even // without the legacy --branch-root flag when targeting a named branch tip. - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", repo_dir.to_str().unwrap(), diff --git a/tests/smoke_check_rules.rs b/tests/smoke_check_rules.rs index 7dff499..341b64d 100644 --- a/tests/smoke_check_rules.rs +++ b/tests/smoke_check_rules.rs @@ -7,7 +7,7 @@ use predicates::prelude::*; #[test] fn check_rules() -> anyhow::Result<()> { // ── run kingfisher ──────────────────────────────────────────────── - Command::cargo_bin("kingfisher")? 
+ Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "rules", "check", diff --git a/tests/smoke_docker.rs b/tests/smoke_docker.rs index 8392c4a..46e22c7 100644 --- a/tests/smoke_docker.rs +++ b/tests/smoke_docker.rs @@ -3,7 +3,7 @@ use std::process::Command; #[test] fn smoke_scan_docker_image() -> anyhow::Result<()> { - let mut cmd = Command::cargo_bin("kingfisher")?; + let mut cmd = Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")); let output = cmd .args([ "scan", diff --git a/tests/smoke_exclude.rs b/tests/smoke_exclude.rs index eb82ef8..504ecb5 100644 --- a/tests/smoke_exclude.rs +++ b/tests/smoke_exclude.rs @@ -14,7 +14,7 @@ fn exclude_pattern_hides_matches() -> anyhow::Result<()> { fs::write(&py, format!("token = \"{}\"\n", SECRET))?; fs::write(&txt, format!("token = \"{}\"\n", SECRET))?; - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", dir.path().to_str().unwrap(), @@ -41,7 +41,7 @@ fn exclude_git_directory_hides_matches() -> anyhow::Result<()> { fs::write(git_dir.join("config"), format!("token = \"{}\"\n", SECRET))?; fs::write(dir.path().join("bar.txt"), format!("token = \"{}\"\n", SECRET))?; - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", dir.path().to_str().unwrap(), diff --git a/tests/smoke_fs.rs b/tests/smoke_fs.rs index 1c1b35b..2e017fe 100644 --- a/tests/smoke_fs.rs +++ b/tests/smoke_fs.rs @@ -18,7 +18,7 @@ fn smoke_scan_filesystem_text_and_binary() -> anyhow::Result<()> { fs::write(&bin_path, [0x89, 0x50, 0x4E, 0x47])?; // tiny PNG header // ── run kingfisher ──────────────────────────────────────────────── - Command::cargo_bin("kingfisher")? 
+ Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", dir.path().to_str().unwrap(), diff --git a/tests/smoke_git.rs b/tests/smoke_git.rs index 428ce24..d1305fb 100644 --- a/tests/smoke_git.rs +++ b/tests/smoke_git.rs @@ -31,7 +31,7 @@ fn smoke_scan_git_history() -> anyhow::Result<()> { repo.commit(Some("HEAD"), &sig, &sig, "update", &tree2, &[&head])?; // ── run kingfisher with git-history mode FULL ───────────────────── - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args([ "scan", repo_dir.to_str().unwrap(), diff --git a/tests/smoke_github_homebrew.rs b/tests/smoke_github_homebrew.rs index e2eaa11..65b5527 100644 --- a/tests/smoke_github_homebrew.rs +++ b/tests/smoke_github_homebrew.rs @@ -3,7 +3,7 @@ use predicates::str::contains; #[test] fn scan_homebrew_github_no_findings() -> anyhow::Result<()> { - Command::cargo_bin("kingfisher")? + Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) .args(["scan", "--git-url", "https://github.com/homebrew/.github", "--no-update-check"]) .assert() .success() From dc02abac6368329e12792ea410f5527c7ea328a2 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Wed, 5 Nov 2025 17:19:11 -0800 Subject: [PATCH 06/19] Added an optional exclude_words list to PatternRequirements so matches containing case-insensitive placeholder words are filtered out, with accompanying tests to cover the new behavior. 
--- CHANGELOG.md | 2 +- README.md | 17 ++++ data/rules/aiven.yml | 2 +- data/rules/bitbucket.yml | 2 +- data/rules/confluent.yml | 4 +- data/rules/coze.yml | 7 +- data/rules/easypost.yml | 2 +- data/rules/generic.yml | 2 +- data/rules/intercom.yml | 2 +- data/rules/mongodb.yml | 2 +- data/rules/odbc.yml | 4 + data/rules/postgres.yml | 4 + data/rules/recaptcha.yml | 2 +- data/rules/sentry.yml | 2 +- data/rules/square.yml | 2 +- data/rules/twitch.yml | 3 +- docs/RULES.md | 16 +-- src/cli/commands/scan.rs | 4 + src/main.rs | 1 + src/matcher.rs | 128 ++++++++++++++++++++---- src/reporter.rs | 1 + src/reporter/json_format.rs | 1 + src/rules/rule.rs | 180 ++++++++++++++++++++++------------ src/scanner/enumerate.rs | 1 + src/scanner/repos.rs | 2 + tests/int_allowlist.rs | 1 + tests/int_bitbucket.rs | 1 + tests/int_dedup.rs | 1 + tests/int_github.rs | 1 + tests/int_gitlab.rs | 2 + tests/int_redact.rs | 1 + tests/int_slack.rs | 2 + tests/int_validation_cache.rs | 1 + tests/int_vulnerable_files.rs | 2 + tests/smoke_baseline.rs | 2 +- 35 files changed, 299 insertions(+), 108 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4337324..ecf43b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. ## [Unreleased] - Added `pattern_requirements` for rules. Enables post-regex character-class checks (digits, uppercase, lowercase, specials) to reduce false positives without lookarounds. Provides lightweight, in-memory validation after matches, keeping patterns fast and readable. -- Added an optional `exclude_words` list to `PatternRequirements` so matches containing case-insensitive placeholder words are filtered out, with accompanying tests to cover the new behavior. +- Added an optional `ignore_if_contains` list to `PatternRequirements` within the Rules structure, so matches containing case-insensitive placeholder words are filtered out, with accompanying tests to cover the new behavior. 
- Updated many rules with `pattern_requirements` - Automatically set `--no-dedup` whenever `--manage-baseline` is supplied so baseline management retains every occurrence of a finding diff --git a/README.md b/README.md index 5924c55..656e73c 100644 --- a/README.md +++ b/README.md @@ -323,6 +323,21 @@ However, you may want to add your own custom rules, or modify a detection to bet First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom Kingfisher rules. +### Pattern requirements and placeholder filtering + +Every rule can declare optional `pattern_requirements` to enforce additional character checks after a regex matches. Each field +is independent: + +- `min_digits`, `min_uppercase`, `min_lowercase`, and `min_special_chars` enforce complexity thresholds. +- `special_chars` lets you override the set of characters counted as "special" when `min_special_chars` is used. +- `ignore_if_contains` lists case-insensitive substrings that should cause a match to be discarded (for example, to drop + `test`, `demo`, or `localhost` values). Kingfisher still accepts the legacy `exclude_words` key as an alias when loading + existing rule files. + +When a match is skipped because of `ignore_if_contains`, Kingfisher logs the event at the `DEBUG` level alongside the rule that +was evaluated. If you need to keep those matches for a particular scan, pass `--no-ignore-if-contains` to `kingfisher scan` to +disable the substring filter without editing any rule files. + Once you've done that, you can provide your custom rules (defined in a YAML file) and provide it to Kingfisher at runtime --- no recompiling required! # 🎉 Usage @@ -1168,6 +1183,8 @@ leaves the default unchanged. - `--skip-aws-account-file `: Load AWS account numbers to skip from a file (one account per line; `#` comments allowed) - `--ignore-comment `: Honor additional inline directives from other scanners (repeatable; e.g. 
`--ignore-comment "gitleaks:allow"`) - `--no-ignore`: Disable inline directives entirely so every match is reported +- `--no-ignore-if-contains`: Ignore the `ignore_if_contains` filter in rules so placeholder words still produce findings + ## Understanding `--confidence` The `--confidence` flag sets a minimum confidence threshold, not an exact match. diff --git a/data/rules/aiven.yml b/data/rules/aiven.yml index de6fdae..0d1ac80 100644 --- a/data/rules/aiven.yml +++ b/data/rules/aiven.yml @@ -9,7 +9,7 @@ rules: ( [a-z0-9/+=]{372} ) - \b + (?:[^A-Za-z0-9/+=]|$) pattern_requirements: min_digits: 2 min_uppercase: 1 diff --git a/data/rules/bitbucket.yml b/data/rules/bitbucket.yml index 61370fa..37912de 100644 --- a/data/rules/bitbucket.yml +++ b/data/rules/bitbucket.yml @@ -51,7 +51,7 @@ rules: confidence: medium examples: - bitbucket_key=HedmnK9h6KD_eh9KK8FlI9ahUc8WfaNZ4gulbrtN2ouV - - bitbucket_secret=kd8j2h4jf9s8mf6l4k9j2h4jf9s8mf6l4k9j2h4jf9s8mf6l + - bitbucket_secret=kd8j2h4jf9s8mf6l4k9j2h4jf9s8mf6l4k9j2h4jf9s8 validation: type: Http content: diff --git a/data/rules/confluent.yml b/data/rules/confluent.yml index 96fefe2..86e7b13 100644 --- a/data/rules/confluent.yml +++ b/data/rules/confluent.yml @@ -32,12 +32,12 @@ rules: ( [A-Z0-9\+/]{64} ) - \b + (?:[^A-Za-z0-9/+=]|$) min_entropy: 3.3 confidence: medium examples: - confluent secret=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890ab - - kafka_token=ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefghijklmnopqrstuvwxyzABCD + - kafka_token=ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefghijklmnopqrstuvwxyzAB references: - https://docs.confluent.io/cloud/current/api.html#tag/API-Keys-(iamv2)/operation/getIamV2ApiKey validation: diff --git a/data/rules/coze.yml b/data/rules/coze.yml index 982578b..c9783f3 100644 --- a/data/rules/coze.yml +++ b/data/rules/coze.yml @@ -3,7 +3,6 @@ rules: id: kingfisher.coze.1 pattern: | (?xi) - \b coze (?:.|[\n\r]){0,32}? 
\b @@ -37,6 +36,6 @@ rules: - https://www.coze.com/docs/developer_guides/coze_api_overview - https://www.coze.com/docs/developer_guides/retrieve_files examples: - - "pat_DlOG7fNcVfmw8cYhPWNcdfwrjjzwDr9EkV8EBjzHdgRWU2DzqHC1pPe0x590NN5f" - - "pat_93QiTdIvZGuRCFcfGTQJJ1VIYZ9dNHanX88wKoMojwMk3tX5tKqfFtxUp0ux8CjI" - - "pat_WvUTLYq5yZyaqegkyLSxXJMjXAJotjYEuC1sqT8daFlfwM3BiaRVJIZsER42DnhV" + - "key_coze = pat_DlOG7fNcVfmw8cYhPWNcdfwrjjzwDr9EkV8EBjzHdgRWU2DzqHC1pPe0x590NN5f" + - "coze_token = pat_93QiTdIvZGuRCFcfGTQJJ1VIYZ9dNHanX88wKoMojwMk3tX5tKqfFtxUp0ux8CjI" + - "coze-key: pat_WvUTLYq5yZyaqegkyLSxXJMjXAJotjYEuC1sqT8daFlfwM3BiaRVJIZsER42DnhV" diff --git a/data/rules/easypost.yml b/data/rules/easypost.yml index d5f1a42..0c85fb6 100644 --- a/data/rules/easypost.yml +++ b/data/rules/easypost.yml @@ -6,7 +6,7 @@ rules: \b ( EZ[AT]K - [A-Z0-9]{54} + [A-Za-z0-9]{54} ) \b pattern_requirements: diff --git a/data/rules/generic.yml b/data/rules/generic.yml index 38308f4..75d8fc5 100644 --- a/data/rules/generic.yml +++ b/data/rules/generic.yml @@ -10,7 +10,7 @@ rules: ) \b pattern_requirements: - min_digits: 4 + min_digits: 2 min_entropy: 3.3 confidence: low examples: diff --git a/data/rules/intercom.yml b/data/rules/intercom.yml index d6f159e..0b408ae 100644 --- a/data/rules/intercom.yml +++ b/data/rules/intercom.yml @@ -18,7 +18,7 @@ rules: examples: - "intercom_access_token: dG9rOvI0NmJlMTA5XzQwM2NfNDVlM184MjQzXzkwMDnmOTE1NGIyONoxOjA=" - - ic_token = "g1ZsclJXTjNfc1pBSzJDemE0eFVDU0U5c25CeDN4Vm9hQ2Zac0hXemZHNGVDPQ==" + - ic_token = "g1ZsclJXTjNfc1pBSzJDemE0eFVDU0U5c25CeDN4Vm9hQ2Zac0hXemZHNPQ==" references: - https://developers.intercom.com/docs/build-an-integration/learn-more/rest-apis diff --git a/data/rules/mongodb.yml b/data/rules/mongodb.yml index f6696c3..e70adec 100644 --- a/data/rules/mongodb.yml +++ b/data/rules/mongodb.yml @@ -82,7 +82,7 @@ rules: ) \b pattern_requirements: - exclude_words: + ignore_if_contains: - "@localhost" - "@127.0.0.1" min_entropy: 3 diff --git 
a/data/rules/odbc.yml b/data/rules/odbc.yml index 6593f92..6f60c6a 100644 --- a/data/rules/odbc.yml +++ b/data/rules/odbc.yml @@ -6,6 +6,10 @@ rules: (?: User | User\ Id | UserId | Uid) \s*=\s* ([^\s;]{3,100}) \s* ; [\ \t]* .{0,10} [\ \t]* (?: Password | Pwd) \s*=\s* ([^\t\ ;]{3,100}) \s* (?: [;] | $) + pattern_requirements: + ignore_if_contains: + - "localhost" + - "127.0.0.1" min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/postgres.yml b/data/rules/postgres.yml index 5cbb4ba..27b54ea 100644 --- a/data/rules/postgres.yml +++ b/data/rules/postgres.yml @@ -24,6 +24,10 @@ rules: (?: \d+ ) + pattern_requirements: + ignore_if_contains: + - "@localhost" + - "@127.0.0.1" min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/recaptcha.yml b/data/rules/recaptcha.yml index f13db51..d513a8d 100644 --- a/data/rules/recaptcha.yml +++ b/data/rules/recaptcha.yml @@ -9,7 +9,7 @@ rules: ( 6l[c-f][a-z0-9_-].{36} ) - \b + (?:[^A-Za-z0-9/]|$) pattern_requirements: min_digits: 3 min_entropy: 3 diff --git a/data/rules/sentry.yml b/data/rules/sentry.yml index 8d424ba..6e0d15e 100644 --- a/data/rules/sentry.yml +++ b/data/rules/sentry.yml @@ -50,7 +50,7 @@ rules: confidence: medium examples: - sntrys_eyJpYXQiOjE2OTA4ODAwMDAsInJlZ2lvbl91cmwiOiJodHRwczovL3NlbnRyeS5pby9vcmdzL215LW9yZy8ifQ==_abcdefghijklmnopqrstuvwx1234567890abcdefabc - - sntrys_eyJpYXQiOiIxNjkwODgwMDAwIiwicmVnaW9uX3VybCI6Imh0dHBzOi8vc2VudHJ5LmlvLyJ9_abcdABCD1234567890abcdABCD1234567890abcdABCD + - sntrys_eyJpYXQiOiIxNjkwODgwMDAwIiwicmVnaW9uX3VybCI6Imh0dHBzOi8vc2VudHJ5LmlvLyJ9_abcdABCD1234567890abcdABCD1234567890abcdABC references: - https://docs.sentry.io/api/auth/ validation: diff --git a/data/rules/square.yml b/data/rules/square.yml index c4aa62f..24a635a 100644 --- a/data/rules/square.yml +++ b/data/rules/square.yml @@ -16,7 +16,7 @@ rules: min_entropy: 3.3 confidence: medium examples: - - square EAAA7h9fL9zQJR8P0eAioAf9239345rDA2349bQ8edUA9FgA5JojdsF3A9f6nKLmn + - square 
EAAA7h9fL9zQJR8P0eAioAf9239345rDA2349bQ8edUA9FgA5JojdsF3A9f6nKLm - square EAAAvlYh9H7dZwC9ash2hrHjtlL5D2srERGK5OM6F2nvle23he3NzA60PAeFXNHj validation: type: Http diff --git a/data/rules/twitch.yml b/data/rules/twitch.yml index 1ab6102..d9ecc55 100644 --- a/data/rules/twitch.yml +++ b/data/rules/twitch.yml @@ -3,7 +3,6 @@ rules: id: kingfisher.twitch.1 pattern: | (?xi) - \b twitch (?:.|[\n\r]){0,32}? \b @@ -19,7 +18,7 @@ rules: confidence: medium examples: - TWITCH_TOKEN=abcdefghijklmnopqrstuvwx123456 - - "twitch_api_token: '0123456789abcdefghijklmnopqrstuv'" + - "twitch_api_token: '0123456789abcdefghijklmnopqrst'" references: - https://dev.twitch.tv/docs/authentication/validate-tokens/ validation: diff --git a/docs/RULES.md b/docs/RULES.md index 816dbe7..8a0d964 100644 --- a/docs/RULES.md +++ b/docs/RULES.md @@ -44,7 +44,7 @@ rules: min_lowercase: 1 # require at least 1 lowercase letter min_special_chars: 1 # require at least 1 special character special_chars: "!@#$%^&*()" # optional: custom special character set - exclude_words: # optional: drop matches containing these words + ignore_if_contains: # optional: drop matches containing these words - test validation: # (optional) live validation @@ -266,14 +266,16 @@ pattern_requirements: min_lowercase: 1 # Require at least 1 lowercase letter (a-z) min_special_chars: 1 # Require at least 1 special character special_chars: "!@#$%^&*" # Optional: define which characters are "special" - exclude_words: # Optional: reject matches containing any of these (case-insensitive) + ignore_if_contains: # Optional: reject matches containing any of these (case-insensitive) - test - demo ``` All fields are optional. If `special_chars` is not specified, the default set includes: `!@#$%^&*()_+-=[]{}|;:'",.<>?/\`~` -`exclude_words` performs a case-insensitive substring check. If any entry (after trimming whitespace) appears within the match, the match is discarded. 
This is helpful for dropping known dummy tokens such as "test" or "demo" that otherwise satisfy the regex. +`ignore_if_contains` performs a case-insensitive substring check. If any entry (after trimming whitespace) appears within the match, the match is discarded. This is helpful for dropping known dummy tokens such as "test" or "demo" that otherwise satisfy the regex. + +When this filter removes a match it is logged at the `DEBUG` level so you can see exactly which substring caused the skip. If you need to keep every match even when one of these substrings appears, pass `--no-ignore-if-contains` to `kingfisher scan`. The flag disables this post-processing step without changing the rule definitions. ### Example: Secure API Key @@ -295,7 +297,7 @@ rules: min_uppercase: 1 # Must contain at least 1 uppercase letter min_lowercase: 1 # Must contain at least 1 lowercase letter min_special_chars: 1 # Must contain at least 1 special character - exclude_words: + ignore_if_contains: - test examples: - api_key = "MyS3cur3K3y!2024" @@ -307,7 +309,7 @@ In this example: - The `pattern_requirements` filters out matches that don't have at least one of each required type - A match like `"abcdefghijklmnopqrst"` would be rejected (no uppercase, no digit, no special) - A match like `"Abc123!SecureToken"` would be accepted (has all required types) -- A match like `"Test123!SecureToken"` would be rejected because it contains the excluded word `test` +- A match like `"Test123!SecureToken"` would be rejected because it contains the `ignore_if_contains` term `test` ### Example: Excluding Dummy Values @@ -318,13 +320,13 @@ rules: pattern: |- (?i)token[:=]\s*([A-Za-z0-9]{12,}) pattern_requirements: - exclude_words: + ignore_if_contains: - placeholder - sample examples: - token: "REALVALUE1234" negative_examples: - - token = "SAMPLETOKEN9999" # dropped by exclude_words + - token = "SAMPLETOKEN9999" # dropped by ignore_if_contains ``` ### Example: Custom Special Characters diff --git 
a/src/cli/commands/scan.rs b/src/cli/commands/scan.rs index 1f0aaf5..f30d028 100644 --- a/src/cli/commands/scan.rs +++ b/src/cli/commands/scan.rs @@ -144,6 +144,10 @@ pub struct ScanArgs { /// Disable inline ignore directives entirely #[arg(long = "no-ignore", default_value_t = false)] pub no_inline_ignore: bool, + + /// Disable rule-level `ignore_if_contains` filtering for pattern requirements + #[arg(long = "no-ignore-if-contains", default_value_t = false)] + pub no_ignore_if_contains: bool, } /// Confidence levels for findings diff --git a/src/main.rs b/src/main.rs index a7deda6..b52fcd3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -446,6 +446,7 @@ fn create_default_scan_args() -> cli::commands::scan::ScanArgs { output_args: OutputArgs { output: None, format: ReportOutputFormat::Pretty }, no_base64: false, no_inline_ignore: false, + no_ignore_if_contains: false, } } /// Run the rules check command diff --git a/src/matcher.rs b/src/matcher.rs index e572c8b..fb39ab4 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -29,7 +29,7 @@ use crate::{ parser, parser::{Checker, Language}, rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer}, - rules::rule::Rule, + rules::rule::{PatternValidationResult, Rule}, rules_database::RulesDatabase, safe_list::{is_safe_match, is_user_match}, scanner_pool::ScannerPool, @@ -203,6 +203,9 @@ pub struct Matcher<'a> { /// Configuration that controls inline ignore directives inline_ignore_config: InlineIgnoreConfig, + + /// Whether matches should honour `ignore_if_contains` requirements. 
+ respect_ignore_if_contains: bool, } /// This `Drop` implementation updates the `global_stats` with the local stats impl<'a> Drop for Matcher<'a> { @@ -232,6 +235,7 @@ impl<'a> Matcher<'a> { shared_profiler: Option>, extra_ignore_directives: &[String], disable_inline_ignores: bool, + respect_ignore_if_contains: bool, ) -> Result { // Changed: removed `with_capacity(16384)` so we don't pre-allocate a large Vec let raw_matches_scratch = Vec::new(); @@ -258,6 +262,7 @@ impl<'a> Matcher<'a> { } else { InlineIgnoreConfig::new(extra_ignore_directives) }, + respect_ignore_if_contains, }) } @@ -414,6 +419,7 @@ impl<'a> Matcher<'a> { redact, &filename, self.profiler.as_ref(), + self.respect_ignore_if_contains, &self.inline_ignore_config, ); } @@ -439,6 +445,7 @@ impl<'a> Matcher<'a> { redact, &filename, self.profiler.as_ref(), + self.respect_ignore_if_contains, &self.inline_ignore_config, ); } @@ -470,6 +477,7 @@ impl<'a> Matcher<'a> { redact, &filename, self.profiler.as_ref(), + self.respect_ignore_if_contains, &self.inline_ignore_config, ); } @@ -574,6 +582,7 @@ fn filter_match<'b>( redact: bool, filename: &str, profiler: Option<&Arc>, + respect_ignore_if_contains: bool, inline_ignore_config: &InlineIgnoreConfig, ) { let mut timer = @@ -605,12 +614,22 @@ fn filter_match<'b>( // Check character requirements if specified if let Some(char_reqs) = rule.pattern_requirements() { - if !char_reqs.validate(mi_bytes) { - debug!( - "Skipping match that does not meet character requirements for rule {}", - rule.id() - ); - continue; + match char_reqs.validate(mi_bytes, respect_ignore_if_contains) { + PatternValidationResult::Passed => {} + PatternValidationResult::Failed => { + debug!( + "Skipping match that does not meet character requirements for rule {}", + rule.id() + ); + continue; + } + PatternValidationResult::IgnoredBySubstring { matched_term } => { + debug!( + "Skipping match for rule {} because it contains ignored term {matched_term}", + rule.id() + ); + continue; + } } } 
@@ -1056,6 +1075,7 @@ mod test { None, &[], false, + true, ) .unwrap(); @@ -1131,6 +1151,7 @@ mod test { None, // Pass the shared profiler &[], false, + true, )?; matcher.scan_bytes_raw(input.as_bytes(), "fname")?; assert_eq!( @@ -1141,7 +1162,7 @@ mod test { } #[test] - fn test_pattern_requirements_exclude_words_filters_matches() -> Result<()> { + fn test_pattern_requirements_ignore_if_contains_filters_matches() -> Result<()> { let rules = vec![Rule::new(RuleSyntax { id: "test.exclude".to_string(), name: "exclude words".to_string(), @@ -1160,7 +1181,7 @@ mod test { min_lowercase: None, min_special_chars: None, special_chars: None, - exclude_words: Some(vec!["TEST".to_string()]), + ignore_if_contains: Some(vec!["TEST".to_string()]), }), })]; @@ -1168,8 +1189,17 @@ mod test { let input = b"prefixgood prefixtest"; let seen_blobs: BlobIdMap = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut matcher = - Matcher::new(&rules_db, scanner_pool, &seen_blobs, None, false, None, &[], false)?; + let mut matcher = Matcher::new( + &rules_db, + scanner_pool, + &seen_blobs, + None, + false, + None, + &[], + false, + true, + )?; let blob = Blob::from_bytes(input.to_vec()); let origin = OriginSet::from(Origin::from_file(PathBuf::from("exclude.txt"))); @@ -1184,16 +1214,75 @@ mod test { } }; - assert_eq!(matches.len(), 1, "exclude_words should drop filtered matches"); + assert_eq!(matches.len(), 1, "ignore_if_contains should drop filtered matches"); assert_eq!( - matches[0].matching_input, - b"prefixgood", + matches[0].matching_input, b"prefixgood", "remaining match should be the non-excluded token", ); Ok(()) } + #[test] + fn test_pattern_requirements_ignore_if_contains_can_be_disabled_in_matcher() -> Result<()> { + let rules = vec![Rule::new(RuleSyntax { + id: "test.exclude".to_string(), + name: "exclude words".to_string(), + pattern: "(?Pprefix[A-Za-z]+)".to_string(), + confidence: crate::rules::rule::Confidence::Medium, + 
min_entropy: 0.0, + visible: true, + examples: vec![], + negative_examples: vec![], + references: vec![], + validation: None, + depends_on_rule: vec![], + pattern_requirements: Some(PatternRequirements { + min_digits: None, + min_uppercase: None, + min_lowercase: None, + min_special_chars: None, + special_chars: None, + ignore_if_contains: Some(vec!["TEST".to_string()]), + }), + })]; + + let rules_db = RulesDatabase::from_rules(rules)?; + let input = b"prefixgood prefixtest"; + let seen_blobs: BlobIdMap = BlobIdMap::new(); + let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); + let mut matcher = Matcher::new( + &rules_db, + scanner_pool, + &seen_blobs, + None, + false, + None, + &[], + false, + false, + )?; + + let blob = Blob::from_bytes(input.to_vec()); + let origin = OriginSet::from(Origin::from_file(PathBuf::from("exclude-disabled.txt"))); + + let matches = match matcher.scan_blob(&blob, &origin, None, false, false, false)? { + ScanResult::New(matches) => matches, + ScanResult::SeenWithMatches => { + panic!( + "unexpected scan result: blob should not be considered previously seen with matches" + ) + } + ScanResult::SeenSansMatches => { + panic!( + "unexpected scan result: blob should not be considered previously seen without matches" + ) + } + }; + + assert_eq!(matches.len(), 2, "disabling ignore_if_contains should keep all matches"); + Ok(()) + } // --------------------------------------------------------------------- // additional deterministic unit-tests @@ -1274,7 +1363,8 @@ mod test { let rules_db = RulesDatabase::from_rules(vec![rule])?; let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); - let mut m = Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false)?; + let mut m = + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?; let buf = b"dup dup"; // two literal hits, same rule @@ -1312,7 +1402,7 @@ mod test { let seen = 
BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); let mut matcher = - Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false)?; + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?; let blob = Blob::from_bytes(b"let key = \"secret_token\" # kingfisher:ignore".to_vec()); let origin = OriginSet::from(Origin::from_file(PathBuf::from("inline.txt"))); @@ -1345,7 +1435,7 @@ mod test { let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); let mut matcher = - Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false)?; + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?; let blob = Blob::from_bytes( br#"let data = """ @@ -1390,7 +1480,7 @@ line2 let seen = BlobIdMap::new(); let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); let mut matcher = - Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false)?; + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &[], false, true)?; let matches_without_compat = match matcher.scan_blob(&blob, &origin, None, false, false, false)? { ScanResult::New(matches) => matches.len(), @@ -1402,7 +1492,7 @@ line2 let scanner_pool = Arc::new(ScannerPool::new(Arc::new(rules_db.vsdb.clone()))); let extra = vec![String::from("gitleaks:allow")]; let mut matcher = - Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &extra, false)?; + Matcher::new(&rules_db, scanner_pool, &seen, None, false, None, &extra, false, true)?; match matcher.scan_blob(&blob, &origin, None, false, false, false)? 
{ ScanResult::New(matches) => assert!(matches.is_empty()), _ => panic!("unexpected scan result"), diff --git a/src/reporter.rs b/src/reporter.rs index bb96064..e8e29ba 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -807,6 +807,7 @@ mod tests { skip_aws_account: Vec::new(), skip_aws_account_file: None, no_inline_ignore: false, + no_ignore_if_contains: false, } } diff --git a/src/reporter/json_format.rs b/src/reporter/json_format.rs index 11c5b30..f9fe2bf 100644 --- a/src/reporter/json_format.rs +++ b/src/reporter/json_format.rs @@ -180,6 +180,7 @@ mod tests { skip_aws_account_file: None, no_base64: false, no_inline_ignore: false, + no_ignore_if_contains: false, } } diff --git a/src/rules/rule.rs b/src/rules/rule.rs index 051adc3..f1f9615 100644 --- a/src/rules/rule.rs +++ b/src/rules/rule.rs @@ -72,7 +72,7 @@ pub struct PatternRequirements { pub special_chars: Option, /// Words that should cause the match to be excluded when present (case-insensitive) #[serde(default)] - pub exclude_words: Option>, + pub ignore_if_contains: Option>, } impl PatternRequirements { @@ -80,8 +80,13 @@ impl PatternRequirements { const DEFAULT_SPECIAL_CHARS: &'static str = "!@#$%^&*()_+-=[]{}|;:'\",.<>?/\\`~"; /// Validates whether the given byte slice meets the character requirements. - /// Returns true if all requirements are met, false otherwise. - pub fn validate(&self, input: &[u8]) -> bool { + /// Returns the validation outcome, including whether the match should be ignored + /// due to `ignore_if_contains` entries when that behaviour is enabled. 
+ pub fn validate( + &self, + input: &[u8], + respect_ignore_if_contains: bool, + ) -> PatternValidationResult { // Convert to string (lossy for non-UTF8) let s = String::from_utf8_lossy(input); @@ -89,7 +94,7 @@ impl PatternRequirements { if let Some(min_digits) = self.min_digits { let digit_count = s.chars().filter(|c| c.is_ascii_digit()).count(); if digit_count < min_digits { - return false; + return PatternValidationResult::Failed; } } @@ -97,7 +102,7 @@ impl PatternRequirements { if let Some(min_uppercase) = self.min_uppercase { let uppercase_count = s.chars().filter(|c| c.is_ascii_uppercase()).count(); if uppercase_count < min_uppercase { - return false; + return PatternValidationResult::Failed; } } @@ -105,7 +110,7 @@ impl PatternRequirements { if let Some(min_lowercase) = self.min_lowercase { let lowercase_count = s.chars().filter(|c| c.is_ascii_lowercase()).count(); if lowercase_count < min_lowercase { - return false; + return PatternValidationResult::Failed; } } @@ -115,33 +120,52 @@ impl PatternRequirements { self.special_chars.as_deref().unwrap_or(Self::DEFAULT_SPECIAL_CHARS); let special_count = s.chars().filter(|c| special_chars.contains(*c)).count(); if special_count < min_special { - return false; + return PatternValidationResult::Failed; } } - // Check exclude words requirement - if let Some(exclude_words) = self.exclude_words.as_ref() { - let lowercase_input = s.to_lowercase(); - if exclude_words - .iter() - .filter_map(|word| { - let trimmed = word.trim(); - if trimmed.is_empty() { - None - } else { - Some(trimmed.to_lowercase()) - } - }) - .any(|word| lowercase_input.contains(&word)) - { - return false; + // Check ignore-if-contains requirement + if respect_ignore_if_contains { + if let Some(ignore_terms) = self.ignore_if_contains.as_ref() { + let lowercase_input = s.to_lowercase(); + if let Some(matched_term) = ignore_terms + .iter() + .filter_map(|term| { + let trimmed = term.trim(); + if trimmed.is_empty() { + None + } else { + Some((trimmed, 
trimmed.to_lowercase())) + } + }) + .find_map(|(original, lowered)| { + if lowercase_input.contains(&lowered) { + Some(original.to_string()) + } else { + None + } + }) + { + return PatternValidationResult::IgnoredBySubstring { matched_term }; + } } } - true + PatternValidationResult::Passed } } +/// Result of validating [`PatternRequirements`] against a potential match. +#[derive(Debug, PartialEq, Eq)] +pub enum PatternValidationResult { + /// All requirements are satisfied and the match should be kept. + Passed, + /// Requirements were not satisfied. + Failed, + /// The match contains one of the `ignore_if_contains` substrings and should be skipped. + IgnoredBySubstring { matched_term: String }, +} + /// Configuration for HTTP validation. This contains a request configuration /// and an optional multipart configuration. #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] @@ -549,17 +573,17 @@ mod tests { min_lowercase: None, min_special_chars: None, special_chars: None, - exclude_words: None, + ignore_if_contains: None, }; // Should pass: has 3 digits - assert!(reqs.validate(b"abc123def")); + assert!(matches!(reqs.validate(b"abc123def", true), PatternValidationResult::Passed)); // Should fail: only 1 digit - assert!(!reqs.validate(b"abc1def")); + assert!(matches!(reqs.validate(b"abc1def", true), PatternValidationResult::Failed)); // Should fail: no digits - assert!(!reqs.validate(b"abcdef")); + assert!(matches!(reqs.validate(b"abcdef", true), PatternValidationResult::Failed)); } #[test] @@ -570,17 +594,17 @@ mod tests { min_lowercase: None, min_special_chars: None, special_chars: None, - exclude_words: None, + ignore_if_contains: None, }; // Should pass: has 3 uppercase - assert!(reqs.validate(b"ABCdef")); + assert!(matches!(reqs.validate(b"ABCdef", true), PatternValidationResult::Passed)); // Should fail: only 1 uppercase - assert!(!reqs.validate(b"Adef")); + assert!(matches!(reqs.validate(b"Adef", true), 
PatternValidationResult::Failed)); // Should fail: no uppercase - assert!(!reqs.validate(b"abcdef")); + assert!(matches!(reqs.validate(b"abcdef", true), PatternValidationResult::Failed)); } #[test] @@ -591,17 +615,17 @@ mod tests { min_lowercase: Some(2), min_special_chars: None, special_chars: None, - exclude_words: None, + ignore_if_contains: None, }; // Should pass: has 3 lowercase - assert!(reqs.validate(b"ABCdef")); + assert!(matches!(reqs.validate(b"ABCdef", true), PatternValidationResult::Passed)); // Should fail: only 1 lowercase - assert!(!reqs.validate(b"ABCd")); + assert!(matches!(reqs.validate(b"ABCd", true), PatternValidationResult::Failed)); // Should fail: no lowercase - assert!(!reqs.validate(b"ABC123")); + assert!(matches!(reqs.validate(b"ABC123", true), PatternValidationResult::Failed)); } #[test] @@ -612,17 +636,17 @@ mod tests { min_lowercase: None, min_special_chars: Some(2), special_chars: None, // uses default - exclude_words: None, + ignore_if_contains: None, }; // Should pass: has 2 special chars - assert!(reqs.validate(b"abc!@def")); + assert!(matches!(reqs.validate(b"abc!@def", true), PatternValidationResult::Passed)); // Should fail: only 1 special char - assert!(!reqs.validate(b"abc!def")); + assert!(matches!(reqs.validate(b"abc!def", true), PatternValidationResult::Failed)); // Should fail: no special chars - assert!(!reqs.validate(b"abcdef")); + assert!(matches!(reqs.validate(b"abcdef", true), PatternValidationResult::Failed)); } #[test] @@ -633,17 +657,17 @@ mod tests { min_lowercase: None, min_special_chars: Some(2), special_chars: Some("$%^".to_string()), - exclude_words: None, + ignore_if_contains: None, }; // Should pass: has 2 custom special chars - assert!(reqs.validate(b"abc$%def")); + assert!(matches!(reqs.validate(b"abc$%def", true), PatternValidationResult::Passed)); // Should fail: has special chars but not the custom ones - assert!(!reqs.validate(b"abc!@def")); + assert!(matches!(reqs.validate(b"abc!@def", true), 
PatternValidationResult::Failed)); // Should fail: only 1 custom special char - assert!(!reqs.validate(b"abc$def")); + assert!(matches!(reqs.validate(b"abc$def", true), PatternValidationResult::Failed)); } #[test] @@ -654,60 +678,90 @@ mod tests { min_lowercase: Some(1), min_special_chars: Some(1), special_chars: None, - exclude_words: None, + ignore_if_contains: None, }; // Should pass: has all requirements - assert!(reqs.validate(b"Abc1!")); + assert!(matches!(reqs.validate(b"Abc1!", true), PatternValidationResult::Passed)); // Should fail: missing digit - assert!(!reqs.validate(b"Abc!")); + assert!(matches!(reqs.validate(b"Abc!", true), PatternValidationResult::Failed)); // Should fail: missing uppercase - assert!(!reqs.validate(b"abc1!")); + assert!(matches!(reqs.validate(b"abc1!", true), PatternValidationResult::Failed)); // Should fail: missing lowercase - assert!(!reqs.validate(b"ABC1!")); + assert!(matches!(reqs.validate(b"ABC1!", true), PatternValidationResult::Failed)); // Should fail: missing special - assert!(!reqs.validate(b"Abc1")); + assert!(matches!(reqs.validate(b"Abc1", true), PatternValidationResult::Failed)); } #[test] - fn test_pattern_requirements_exclude_words() { + fn test_pattern_requirements_ignore_if_contains() { let reqs = PatternRequirements { min_digits: None, min_uppercase: None, min_lowercase: None, min_special_chars: None, special_chars: None, - exclude_words: Some(vec!["test".to_string(), "Demo".to_string()]), + ignore_if_contains: Some(vec!["test".to_string(), "Demo".to_string()]), }; // Should fail: contains "test" (case-insensitive) - assert!(!reqs.validate(b"MyTestToken")); + assert!(matches!( + reqs.validate(b"MyTestToken", true), + PatternValidationResult::IgnoredBySubstring { .. } + )); // Should fail: contains "demo" (case-insensitive) - assert!(!reqs.validate(b"example-demo-value")); + assert!(matches!( + reqs.validate(b"example-demo-value", true), + PatternValidationResult::IgnoredBySubstring { .. 
} + )); // Should pass: does not contain excluded words - assert!(reqs.validate(b"example-value")); + assert!(matches!(reqs.validate(b"example-value", true), PatternValidationResult::Passed)); } #[test] - fn test_pattern_requirements_exclude_words_ignores_empty_entries() { + fn test_pattern_requirements_ignore_if_contains_ignores_empty_entries() { let reqs = PatternRequirements { min_digits: None, min_uppercase: None, min_lowercase: None, min_special_chars: None, special_chars: None, - exclude_words: Some(vec![" ".to_string(), "".to_string(), "BLOCK".to_string()]), + ignore_if_contains: Some(vec![" ".to_string(), "".to_string(), "BLOCK".to_string()]), }; // Should fail only when non-empty exclusion matches - assert!(!reqs.validate(b"needs-blocking")); - assert!(reqs.validate(b"allowed")); + assert!(matches!( + reqs.validate(b"needs-blocking", true), + PatternValidationResult::IgnoredBySubstring { .. } + )); + assert!(matches!(reqs.validate(b"allowed", true), PatternValidationResult::Passed)); + } + + #[test] + fn test_pattern_requirements_ignore_if_contains_can_be_disabled() { + let reqs = PatternRequirements { + min_digits: None, + min_uppercase: None, + min_lowercase: None, + min_special_chars: None, + special_chars: None, + ignore_if_contains: Some(vec!["ignoreme".to_string()]), + }; + + // With ignoring enabled, the match is skipped + assert!(matches!( + reqs.validate(b"value-ignoreme", true), + PatternValidationResult::IgnoredBySubstring { .. 
} + )); + + // With ignoring disabled, the same input passes requirements + assert!(matches!(reqs.validate(b"value-ignoreme", false), PatternValidationResult::Passed)); } #[test] @@ -718,12 +772,12 @@ mod tests { min_lowercase: None, min_special_chars: None, special_chars: None, - exclude_words: None, + ignore_if_contains: None, }; // Should pass: no requirements - assert!(reqs.validate(b"anything")); - assert!(reqs.validate(b"123")); - assert!(reqs.validate(b"!@#")); + assert!(matches!(reqs.validate(b"anything", true), PatternValidationResult::Passed)); + assert!(matches!(reqs.validate(b"123", true), PatternValidationResult::Passed)); + assert!(matches!(reqs.validate(b"!@#", true), PatternValidationResult::Passed)); } } diff --git a/src/scanner/enumerate.rs b/src/scanner/enumerate.rs index 06cc3c9..1a354ae 100644 --- a/src/scanner/enumerate.rs +++ b/src/scanner/enumerate.rs @@ -169,6 +169,7 @@ pub fn enumerate_filesystem_inputs( Some(shared_profiler), &args.extra_ignore_comments, args.no_inline_ignore, + !args.no_ignore_if_contains, )?; let blob_processor_init_time = Mutex::new(t1.elapsed()); let make_blob_processor = || -> BlobProcessor { diff --git a/src/scanner/repos.rs b/src/scanner/repos.rs index f041049..331f610 100644 --- a/src/scanner/repos.rs +++ b/src/scanner/repos.rs @@ -683,6 +683,7 @@ pub async fn fetch_s3_objects( Some(shared_profiler.clone()), &args.extra_ignore_comments, args.no_inline_ignore, + !args.no_ignore_if_contains, )?; let mut processor = BlobProcessor { matcher }; @@ -764,6 +765,7 @@ pub async fn fetch_gcs_objects( Some(shared_profiler.clone()), &args.extra_ignore_comments, args.no_inline_ignore, + !args.no_ignore_if_contains, )?; let mut processor = BlobProcessor { matcher }; diff --git a/tests/int_allowlist.rs b/tests/int_allowlist.rs index d950c47..dcd9067 100644 --- a/tests/int_allowlist.rs +++ b/tests/int_allowlist.rs @@ -148,6 +148,7 @@ fn run_skiplist(skip_regex: Vec, skip_skipword: Vec) -> Result Result<()> { no_base64: false, 
extra_ignore_comments: Vec::new(), no_inline_ignore: false, + no_ignore_if_contains: false, }; let global_args = GlobalArgs { diff --git a/tests/int_dedup.rs b/tests/int_dedup.rs index b6eaae9..d059e73 100644 --- a/tests/int_dedup.rs +++ b/tests/int_dedup.rs @@ -168,6 +168,7 @@ rules: no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, + no_ignore_if_contains: false, }; let global_args = GlobalArgs { diff --git a/tests/int_github.rs b/tests/int_github.rs index f96dd16..91af8db 100644 --- a/tests/int_github.rs +++ b/tests/int_github.rs @@ -155,6 +155,7 @@ fn test_github_remote_scan() -> Result<()> { no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, + no_ignore_if_contains: false, }; // Create global arguments let global_args = GlobalArgs { diff --git a/tests/int_gitlab.rs b/tests/int_gitlab.rs index 71421d1..6651fa7 100644 --- a/tests/int_gitlab.rs +++ b/tests/int_gitlab.rs @@ -153,6 +153,7 @@ fn test_gitlab_remote_scan() -> Result<()> { skip_aws_account_file: None, no_base64: false, no_inline_ignore: false, + no_ignore_if_contains: false, }; let global_args = GlobalArgs { @@ -304,6 +305,7 @@ fn test_gitlab_remote_scan_no_history() -> Result<()> { no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, + no_ignore_if_contains: false, }; let global_args = GlobalArgs { diff --git a/tests/int_redact.rs b/tests/int_redact.rs index 86b9dd1..b9fb1f8 100644 --- a/tests/int_redact.rs +++ b/tests/int_redact.rs @@ -131,6 +131,7 @@ async fn test_redact_hashes_finding_values() -> Result<()> { no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, + no_ignore_if_contains: false, }; let global_args = GlobalArgs { diff --git a/tests/int_slack.rs b/tests/int_slack.rs index d2d9048..27c5999 100644 --- a/tests/int_slack.rs +++ b/tests/int_slack.rs @@ -139,6 +139,7 @@ impl TestContext { skip_aws_account_file: None, no_base64: false, no_inline_ignore: false, + no_ignore_if_contains: 
false, }; let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules).load(&scan_args)?; @@ -278,6 +279,7 @@ async fn test_scan_slack_messages() -> Result<()> { no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, + no_ignore_if_contains: false, }; let global_args = GlobalArgs { diff --git a/tests/int_validation_cache.rs b/tests/int_validation_cache.rs index d868936..970ad0a 100644 --- a/tests/int_validation_cache.rs +++ b/tests/int_validation_cache.rs @@ -211,6 +211,7 @@ async fn test_validation_cache_and_depvars() -> Result<()> { no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, + no_ignore_if_contains: false, }; /* --------------------------------------------------------- * diff --git a/tests/int_vulnerable_files.rs b/tests/int_vulnerable_files.rs index 9a17797..3e8bc4a 100644 --- a/tests/int_vulnerable_files.rs +++ b/tests/int_vulnerable_files.rs @@ -154,6 +154,7 @@ impl TestContext { no_base64: false, extra_ignore_comments: Vec::new(), no_inline_ignore: false, + no_ignore_if_contains: false, }; let loaded = RuleLoader::from_rule_specifiers(&scan_args.rules) @@ -281,6 +282,7 @@ impl TestContext { skip_aws_account_file: None, no_base64: false, no_inline_ignore: false, + no_ignore_if_contains: false, }; let global_args = GlobalArgs { diff --git a/tests/smoke_baseline.rs b/tests/smoke_baseline.rs index 5c36797..8f9a8f2 100644 --- a/tests/smoke_baseline.rs +++ b/tests/smoke_baseline.rs @@ -1,9 +1,9 @@ use std::fs; use assert_cmd::Command; +use clap::Parser; use predicates::prelude::*; use tempfile::tempdir; -use clap::Parser; const GH_PAT: &str = "ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa"; From c9555422b4854cb2b26cd12db0008bda84188ed8 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Wed, 5 Nov 2025 18:30:21 -0800 Subject: [PATCH 07/19] updated ci --- .github/workflows/release.yml | 2 +- CHANGELOG.md | 11 ++++++----- Cargo.toml | 2 +- data/rules/stripe.yml | 4 ---- 4 files changed, 8 insertions(+), 11 
deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4debe1d..f2940cf 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -119,7 +119,7 @@ jobs: macos-x64: name: macOS x64 - runs-on: macos-13 + runs-on: macos-15-intel steps: - uses: actions/checkout@v4 diff --git a/CHANGELOG.md b/CHANGELOG.md index ecf43b1..884beb6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,11 +2,12 @@ All notable changes to this project will be documented in this file. -## [Unreleased] -- Added `pattern_requirements` for rules. Enables post-regex character-class checks (digits, uppercase, lowercase, specials) to reduce false positives without lookarounds. Provides lightweight, in-memory validation after matches, keeping patterns fast and readable. -- Added an optional `ignore_if_contains` list to `PatternRequirements` within the Rules structure, so matches containing case-insensitive placeholder words are filtered out, with accompanying tests to cover the new behavior. -- Updated many rules with `pattern_requirements` -- Automatically set `--no-dedup` whenever `--manage-baseline` is supplied so baseline management retains every occurrence of a finding +## [v1.62.0] +- Added `pattern_requirements` checks to rules, providing lightweight post-regex character-class validation without lookarounds. +- Added an `ignore_if_contains` option to `PatternRequirements` to drop matches containing case-insensitive placeholder words, with tests covering the new behavior. +- Updated rules to adopt the new `pattern_requirements` support. +- Automatically enable `--no-dedup` when `--manage-baseline` is supplied so baseline management keeps every finding. + ## [v1.61.0] - Fixed local filesystem scans to keep `open_path_as_is` enabled when opening Git repositories and only disable it for diff-based scans. 
diff --git a/Cargo.toml b/Cargo.toml index d85f76f..abb87db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ publish = false [package] name = "kingfisher" -version = "1.61.0" +version = "1.62.0" description = "MongoDB's blazingly fast and accurate secret scanning and validation tool" edition.workspace = true rust-version.workspace = true diff --git a/data/rules/stripe.yml b/data/rules/stripe.yml index 934dba7..9d48341 100644 --- a/data/rules/stripe.yml +++ b/data/rules/stripe.yml @@ -8,12 +8,10 @@ rules: (?:.|[\n\r]){0,16}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,16}? - \b ( pk_live_ (?:[0-9A-Z]{6}){4,30} ) - \b pattern_requirements: min_digits: 2 min_uppercase: 1 @@ -33,14 +31,12 @@ rules: (?:.|[\n\r]){0,16}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,16}? - \b ( (?: sk|rk )_live_ (?:[0-9A-Z]{8}){3,25} ) - \b pattern_requirements: min_digits: 2 min_entropy: 3.3 From 563c1a4be047ed684ad96790236b6fd1a28b73c3 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Wed, 5 Nov 2025 22:35:29 -0800 Subject: [PATCH 08/19] updated ci --- data/rules/adobe.yml | 2 +- data/rules/aiven.yml | 2 +- data/rules/aws.yml | 15 ++++++++------- data/rules/azurestorage.yml | 2 +- data/rules/confluent.yml | 2 +- data/rules/github.yml | 14 ++++++++------ data/rules/google.yml | 2 +- data/rules/ibm.yml | 3 +-- data/rules/jwt.yml | 2 +- data/rules/mapbox.yml | 6 +++--- data/rules/openweathermap.yml | 5 +---- data/rules/pypi.yml | 2 +- data/rules/recaptcha.yml | 2 +- data/rules/sentry.yml | 4 +++- 14 files changed, 32 insertions(+), 31 deletions(-) diff --git a/data/rules/adobe.yml b/data/rules/adobe.yml index 09ec783..3eb79a2 100644 --- a/data/rules/adobe.yml +++ b/data/rules/adobe.yml @@ -62,7 +62,7 @@ rules: ( p8e-[A-Z0-9-]{32} ) - (?:[^A-Z0-9-]|$) + (?:[^A-Z0-9-]) min_entropy: 3.5 examples: - | diff --git a/data/rules/aiven.yml b/data/rules/aiven.yml index 0d1ac80..0b98079 100644 --- a/data/rules/aiven.yml +++ b/data/rules/aiven.yml @@ -9,7 +9,7 @@ rules: ( 
[a-z0-9/+=]{372} ) - (?:[^A-Za-z0-9/+=]|$) + (?:[^A-Za-z0-9/+=]) pattern_requirements: min_digits: 2 min_uppercase: 1 diff --git a/data/rules/aws.yml b/data/rules/aws.yml index 6cd938a..dda39b2 100644 --- a/data/rules/aws.yml +++ b/data/rules/aws.yml @@ -22,22 +22,23 @@ rules: pattern: | (?xi) (?: - \b + (?:[^A-Za-z0-9/+=]) (?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA) (?:.|[\n\r]){0,64}? - \b - ([A-Za-z0-9/+]{40}) - \b + (?:[^A-Za-z0-9/+=]) + ([A-Za-z0-9+]{40}) + (?:[^A-Za-z0-9/+=]) | - \b(?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA) + (?:[^A-Za-z0-9/+=]) + (?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA) (?:.|[\n\r]){0,96}? (?:SECRET|PRIVATE|ACCESS) (?:.|[\n\r]){0,16}? (?:KEY|TOKEN) (?:.|[\n\r]){0,64}? \b - ([A-Za-z0-9/+]{40}) - \b + ([A-Za-z0-9+]{40}) + (?:[^A-Za-z0-9/+=]) ) pattern_requirements: min_digits: 2 diff --git a/data/rules/azurestorage.yml b/data/rules/azurestorage.yml index 75698f4..f577194 100644 --- a/data/rules/azurestorage.yml +++ b/data/rules/azurestorage.yml @@ -28,7 +28,7 @@ rules: id: kingfisher.azurestorage.2 pattern: | (?xi) - \b + (?:[^A-Za-z0-9/+=]) azure (?:.|[\n\r]){0,128}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) diff --git a/data/rules/confluent.yml b/data/rules/confluent.yml index 86e7b13..a02b051 100644 --- a/data/rules/confluent.yml +++ b/data/rules/confluent.yml @@ -32,7 +32,7 @@ rules: ( [A-Z0-9\+/]{64} ) - (?:[^A-Za-z0-9/+=]|$) + (?:[^A-Za-z0-9/+=]) min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/github.yml b/data/rules/github.yml index 88f9a56..7941ac5 100644 --- a/data/rules/github.yml +++ b/data/rules/github.yml @@ -176,12 +176,14 @@ rules: pattern: | (?xi) github - .? - (?: api | app | application | client | consumer | customer | secret | key ) - .? - (?: key | oauth | sec | secret )? - .{0,2} \s{0,20} .{0,2} \s{0,20} .{0,2} - \b ([a-z0-9]{40}) \b + (?:.|[\n\r]){0,32}? + (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) + (?:.|[\n\r]){0,32}? 
+ (?:[^A-Za-z0-9/+=]) + ( + [a-z0-9]{40} + ) + (?:[^A-Za-z0-9/+=]) depends_on_rule: - rule_id: "kingfisher.github.5" variable: GITHUB_CLIENT_ID diff --git a/data/rules/google.yml b/data/rules/google.yml index c338245..51d5103 100644 --- a/data/rules/google.yml +++ b/data/rules/google.yml @@ -51,7 +51,7 @@ rules: pattern: | (?xi) (ya29\.[0-9A-Z_-]{20,1024}) - (?: [^0-9A-Z_-]|$) + (?: [^0-9A-Z_-]) pattern_requirements: min_digits: 2 min_uppercase: 1 diff --git a/data/rules/ibm.yml b/data/rules/ibm.yml index a2e7ec6..284e5fc 100644 --- a/data/rules/ibm.yml +++ b/data/rules/ibm.yml @@ -3,6 +3,7 @@ rules: id: kingfisher.ibm.1 pattern: | (?xi) + (?:[^A-Za-z0-9/+=]) (?:ibm(?:cloud)?|bx) (?:.|[\n\r]){0,32}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) @@ -15,11 +16,9 @@ rules: min_digits: 2 min_entropy: 3.5 confidence: medium - examples: - ibmcloud_apikey = abcdef0123_56789abcdef0123456789abcdef01234 - ibm_platform_key="f-_RrJDVnuVh07HNTcmnQx_b6CbcQsxmEarVm9P_RWtF" - references: - https://cloud.ibm.com/docs/account?topic=account-userapikey - https://cloud.ibm.com/apidocs/iam-identity-token-api diff --git a/data/rules/jwt.yml b/data/rules/jwt.yml index b39d991..cbd5b46 100644 --- a/data/rules/jwt.yml +++ b/data/rules/jwt.yml @@ -10,7 +10,7 @@ rules: \. 
[A-Za-z0-9_-]{12,} (?# signature ) ) - (?:[^A-Z0-9_-]|$) + (?:[^A-Z0-9_-]) pattern_requirements: min_digits: 4 min_entropy: 3.3 diff --git a/data/rules/mapbox.yml b/data/rules/mapbox.yml index ecbfa29..26ab432 100644 --- a/data/rules/mapbox.yml +++ b/data/rules/mapbox.yml @@ -1,7 +1,7 @@ rules: - name: Mapbox Public Access Token id: kingfisher.mapbox.1 - pattern: '(?i)(?s)mapbox.{0,30}(pk\.[a-z0-9\-+/=]{32,128}\.[a-z0-9\-+/=]{20,30})(?:[^a-z0-9\-+/=]|$)' + pattern: '(?i)(?s)mapbox.{0,30}(pk\.[a-z0-9\-+/=]{32,128}\.[a-z0-9\-+/=]{20,30})(?:[^a-z0-9\-+/=])' pattern_requirements: min_digits: 2 min_entropy: 3.3 @@ -28,7 +28,7 @@ rules: - name: Mapbox Secret Access Token id: kingfisher.mapbox.2 - pattern: '(?i)(?s)mapbox.{0,30}(sk\.[a-z0-9\-+/=]{32,128}\.[a-z0-9\-+/=]{20,30})(?:[^a-z0-9\-+/=]|$)' + pattern: '(?i)(?s)mapbox.{0,30}(sk\.[a-z0-9\-+/=]{32,128}\.[a-z0-9\-+/=]{20,30})(?:[^a-z0-9\-+/=])' pattern_requirements: min_digits: 2 min_entropy: 3.3 @@ -54,7 +54,7 @@ rules: - name: Mapbox Temporary Access Token id: kingfisher.mapbox.3 - pattern: '(?i)(?s)mapbox.{0,30}(tk\.[a-z0-9\-+/=]{32,128}\.[a-z0-9\-+/=]{20,30})(?:[^a-z0-9\-+/=]|$)' + pattern: '(?i)(?s)mapbox.{0,30}(tk\.[a-z0-9\-+/=]{32,128}\.[a-z0-9\-+/=]{20,30})(?:[^a-z0-9\-+/=])' pattern_requirements: min_digits: 2 min_entropy: 3.3 diff --git a/data/rules/openweathermap.yml b/data/rules/openweathermap.yml index b3fa549..f2b2743 100644 --- a/data/rules/openweathermap.yml +++ b/data/rules/openweathermap.yml @@ -1,12 +1,9 @@ rules: - # --------------------------------------------------------------------- - # 1. OpenWeather Map API Key (detector unchanged, new validation) - # --------------------------------------------------------------------- - name: OpenWeather Map API Key id: kingfisher.openweather.1 pattern: | (?xi) - (?:pyowm|openweather|owm\b) + (?:pyowm|openweather|\bowm\b) (?:.|[\n\r]){0,64}? 
\b ( diff --git a/data/rules/pypi.yml b/data/rules/pypi.yml index dd11dd4..a0bd49d 100644 --- a/data/rules/pypi.yml +++ b/data/rules/pypi.yml @@ -7,7 +7,7 @@ rules: ( pypi-AgEIcHlwaS5vcmc[A-Z0-9_-]{50,} ) - (?:[^A-Z0-9_-]|$) + (?:[^A-Z0-9_-]) pattern_requirements: min_digits: 2 min_entropy: 3.3 diff --git a/data/rules/recaptcha.yml b/data/rules/recaptcha.yml index d513a8d..a1d1631 100644 --- a/data/rules/recaptcha.yml +++ b/data/rules/recaptcha.yml @@ -9,7 +9,7 @@ rules: ( 6l[c-f][a-z0-9_-].{36} ) - (?:[^A-Za-z0-9/]|$) + (?:[^A-Za-z0-9/]) pattern_requirements: min_digits: 3 min_entropy: 3 diff --git a/data/rules/sentry.yml b/data/rules/sentry.yml index 6e0d15e..b53c8af 100644 --- a/data/rules/sentry.yml +++ b/data/rules/sentry.yml @@ -6,11 +6,13 @@ rules: \b sentry (?:.|[\n\r]){0,32}? + (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) + (?:.|[\n\r]){0,32}? \b ( [a-f0-9]{64} ) - \b + (?:[^A-Za-z0-9/+=]) pattern_requirements: min_digits: 2 min_entropy: 3.5 From 724690dba856af0d7c4fae1d67eab3cf24f422be Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Thu, 6 Nov 2025 09:11:24 -0800 Subject: [PATCH 09/19] updated ci --- data/rules/gitlab.yml | 2 +- data/rules/postgres.yml | 1 + data/rules/telegram.yml | 12 +++++++----- data/rules/uri.yml | 24 +++++++++++++----------- 4 files changed, 22 insertions(+), 17 deletions(-) diff --git a/data/rules/gitlab.yml b/data/rules/gitlab.yml index 08ea0ea..1cb6bd2 100644 --- a/data/rules/gitlab.yml +++ b/data/rules/gitlab.yml @@ -86,7 +86,7 @@ rules: id: kingfisher.gitlab.3 pattern: | (?xi) - \b + (?:[^A-Za-z0-9/+=]|\r\n|\\n) ( glptt-[0-9a-f]{40} ) diff --git a/data/rules/postgres.yml b/data/rules/postgres.yml index 27b54ea..bf4d361 100644 --- a/data/rules/postgres.yml +++ b/data/rules/postgres.yml @@ -28,6 +28,7 @@ rules: ignore_if_contains: - "@localhost" - "@127.0.0.1" + - "localhost:" min_entropy: 3.3 confidence: medium examples: diff --git a/data/rules/telegram.yml b/data/rules/telegram.yml index 8786d64..f77daf4 100644 --- 
a/data/rules/telegram.yml +++ b/data/rules/telegram.yml @@ -4,12 +4,14 @@ rules: pattern: | (?xi) \b + (?:telegram|tgram:) + (?:.|[\n\r]){0,32}? ( - [0-9]{8,10} + [0-9]{7,10} : [A-Z0-9_-]{35} ) - \b + (?:[^A-Za-z0-9/+=]) pattern_requirements: min_digits: 2 confidence: medium @@ -27,6 +29,6 @@ rules: words: - '"ok":true' examples: - - "110201543:AAHdqTcvCH1vGWJxfSeofSAs0K5PALDsawd" - - "508627689:AAEuLPKs-EhrjrYGnz60bnYNZqakf6HJxc0" - - "3628091811:BAG9RuJiqgOGIfFbOPBpAo6QhIJoD9mCdDs" + - "tgram://110201543:AAHdqTcvCH1vGWJxfSeofSAs0K5PALDsawd" + - "telegram: 508627689:AAEuLPKs-EhrjrYGnz60bnYNZqakf6HJxc0" + - "telegram token is 3628091811:BAG9RuJiqgOGIfFbOPBpAo6QhIJoD9mCdDs" \ No newline at end of file diff --git a/data/rules/uri.yml b/data/rules/uri.yml index e5542be..04c8057 100644 --- a/data/rules/uri.yml +++ b/data/rules/uri.yml @@ -3,17 +3,19 @@ rules: id: kingfisher.uri.1 pattern: | (?xi) - (https?):// - [a-z][a-z0-9+\-.]* - : - [a-z0-9\-._~%!$&'()*,;=]+ - @ - (?:[a-z0-9\-._~%]+|\[[a-f0-9:.]+\]|\[v[a-f0-9][a-z0-9\-._~%!$&'()*,;=:]+\]) - (:?[0-9]+)? - (?:/[a-z0-9\-._~%!$&'()*,;=:@]*)* # Match path - /? - (?:\?[a-z0-9\-._~%!$&'()*,;=:@/?]*)? - (?:\#[a-z0-9\-._~%!$&'()*,;=:@/?]*)? + ( + (?:https?):// + [A-Za-z](?:[A-Za-z0-9+\-._~!$&'()*+,;=]|%[0-9A-Fa-f]{2})* + : + (?:[A-Za-z0-9\-._~!$&'()*+,;=]|%[0-9A-Fa-f]{2})+ + @ + (?:[A-Za-z0-9\-._~%]+|\[[A-Fa-f0-9:.]+\]|\[v[A-Fa-f0-9][A-Za-z0-9\-._~%!$&'()*,;=:]+\]) + (:?[0-9]+)? + (?:/[A-Za-z0-9\-._~%!$&'()*,;=:@%]*)* + /? + (?:\?[A-Za-z0-9\-._~%!$&'()*,;=:@/?%]*)? + (?:\#[A-Za-z0-9\-._~%!$&'()*,;=:@/?%]*)? 
+ ) min_entropy: 4.0 confidence: medium examples: From 5b9c4743a5911282d7ada8ba3094a1219acec521 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Thu, 6 Nov 2025 16:22:35 -0800 Subject: [PATCH 10/19] updated ci --- CHANGELOG.md | 1 + README.md | 4 +- data/rules/github.yml | 118 ++++++++++++++++++++++++++++++---------- data/rules/mongodb.yml | 4 -- data/rules/ngrok.yml | 6 +- data/rules/postgres.yml | 5 -- 6 files changed, 96 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 884beb6..9f94263 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ All notable changes to this project will be documented in this file. ## [v1.62.0] +- This release is focused on further improving detection accuracy, before even attempting to validate findings. - Added `pattern_requirements` checks to rules, providing lightweight post-regex character-class validation without lookarounds. - Added an `ignore_if_contains` option to `PatternRequirements` to drop matches containing case-insensitive placeholder words, with tests covering the new behavior. - Updated rules to adopt the new `pattern_requirements` support. diff --git a/README.md b/README.md index 656e73c..95a4d68 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md)) - [ Run Kingfisher in Docker](#-run-kingfisher-in-docker) - [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance) - [📝 Write Custom Rules!](#-write-custom-rules) + - [Pattern requirements and placeholder filtering](#pattern-requirements-and-placeholder-filtering) - [🎉 Usage](#-usage) - [Basic Examples](#basic-examples) - [Scan with secret validation](#scan-with-secret-validation) @@ -331,8 +332,7 @@ is independent: - `min_digits`, `min_uppercase`, `min_lowercase`, and `min_special_chars` enforce complexity thresholds. - `special_chars` lets you override the set of characters counted as "special" when `min_special_chars` is used. 
- `ignore_if_contains` lists case-insensitive substrings that should cause a match to be discarded (for example, to drop - `test`, `demo`, or `localhost` values). Kingfisher still accepts the legacy `exclude_words` key as an alias when loading - existing rule files. + `test`, `demo`, or `localhost` values). When a match is skipped because of `ignore_if_contains`, Kingfisher logs the event at the `DEBUG` level alongside the rule that was evaluated. If you need to keep those matches for a particular scan, pass `--no-ignore-if-contains` to `kingfisher scan` to diff --git a/data/rules/github.yml b/data/rules/github.yml index 7941ac5..f4c53d8 100644 --- a/data/rules/github.yml +++ b/data/rules/github.yml @@ -1,34 +1,17 @@ rules: - - name: GitHub Personal Access Token + - name: GitHub Personal Access Token - fine-grained permissions id: kingfisher.github.1 pattern: | - (?xi) - \b + (?xi) ( - (?: # for token prefixes - ghp| # Personal Access Token - gho| # OAuth Token - ghu| # GitHub App User-to-Server Token - ghs| # GitHub App Server-to-Server Token - ghr| # Refresh Token - github_pat # Alternative format for Personal Access Token - )_ - (?: # for token body - [a-z0-9_]{35,235} # 35 to 235 lowercase alphanumeric characters or underscores - ) - ) - \b + github_pat + [A-Z0-9]{80,84} + ) pattern_requirements: min_digits: 2 min_lowercase: 2 min_entropy: 3.5 examples: - - "GITHUB_KEY=ghp_XIxB7KMNdAr3zqWtQqhE94qglHqOzn1D1stg" - - "let g:gh_token='ghp_4U3LSowpDx8XvYE7A8GH56oxU5aWnY2mzIbV'" - - | - ## git developer settings - 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 - - "oauth_token: gho_fq75OMU7UVbS9pTZmoCCzJT6TM5d1w099FgG" - "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" references: - https://docs.github.com/en/rest/users?apiVersion=2022-11-28 @@ -52,8 +35,49 @@ rules: type: WordMatch words: - '"login"' - - name: GitHub OAuth Access Token + - name: GitHub Personal Access Token id: kingfisher.github.2 + pattern: | + (?xi) + \b + 
( + ghp_ + [A-Z0-9]{36} + ) + pattern_requirements: + min_digits: 2 + min_lowercase: 2 + min_entropy: 3.5 + examples: + - "GITHUB_KEY=ghp_XIxB7KMNdAr3zqWtQqhE94qglHqOzn1D1stg" + - "let g:gh_token='ghp_4U3LSowpDx8XvYE7A8GH56oxU5aWnY2mzIbV'" + - | + ## git developer settings + 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 + references: + - https://docs.github.com/en/rest/users?apiVersion=2022-11-28 + validation: + type: Http + content: + request: + method: POST + url: https://api.github.com/graphql + headers: + Authorization: token {{ TOKEN }} + Accept: application/vnd.github+json + Content-Type: application/json + body: | + { + "query": "{ viewer { login } }" + } + response_matcher: + - report_response: true + - match_all_words: true + type: WordMatch + words: + - '"login"' + - name: GitHub OAuth Access Token + id: kingfisher.github.3 pattern: | (?xi) \b @@ -90,15 +114,49 @@ rules: type: WordMatch words: - '"login"' - - name: GitHub App Token - id: kingfisher.github.3 + - name: GitHub App User-to-Server Token + id: kingfisher.github.4 pattern: | (?xi) ( - (?:ghu|ghs)_[A-Z0-9]{36} + ghu_[A-Z0-9]{36} ) examples: - ' "token": "ghu_16C7e42F292c69C2E7C10c838347Ae178B4a",' + - | + Example usage: + git clone http://ghu_RguXIkihJjwHAP6eXEYxaPNvywurTr5IOAbg@github.com/username/repo.git + references: + - https://docs.github.com/en/rest/users?apiVersion=2022-11-28 + validation: + type: Http + content: + request: + method: POST + url: https://api.github.com/graphql + headers: + Authorization: token {{ TOKEN }} + Accept: application/vnd.github+json + Content-Type: application/json + body: | + { + "query": "{ viewer { login } }" + } + response_matcher: + - report_response: true + - match_all_words: true + type: WordMatch + words: + - '"login"' + - name: GitHub App Server-to-Server Token + id: kingfisher.github.5 + pattern: | + (?xi) + ( + ghs_[A-Z0-9]{36} + ) + examples: + - ' "token": "ghs_16C7e42F292c69C2E7C10c838347Ae178B4a",' - | Example usage: git clone 
http://ghs_RguXIkihJjwHAP6eXEYxaPNvywurTr5IOAbg@github.com/username/repo.git @@ -125,7 +183,7 @@ rules: words: - '"login"' - name: GitHub Refresh Token - id: kingfisher.github.4 + id: kingfisher.github.6 pattern: | (?xi) ( @@ -156,7 +214,7 @@ rules: words: - '"login"' - name: GitHub Client ID - id: kingfisher.github.5 + id: kingfisher.github.7 pattern: | (?xi) (?:github) @@ -172,7 +230,7 @@ rules: GITHUB_CLIENT_ID=ac58d6da7d7a84c039b7 GITHUB_SECRET=37d02377a3e9d849e18704c3ec883f9c5787d857 - name: GitHub Secret Key - id: kingfisher.github.6 + id: kingfisher.github.8 pattern: | (?xi) github @@ -209,7 +267,7 @@ rules: GITHUB_CLIENT_ID=ac58d6da7d7a84c039b7 GITHUB_SECRET=37d02377a3e9d849e18704c3ec883f9c5787d857 - name: GitHub Personal Access Token (fine-grained permissions) - id: kingfisher.github.7 + id: kingfisher.github.9 pattern: | (?xi) ( diff --git a/data/rules/mongodb.yml b/data/rules/mongodb.yml index e70adec..28b4031 100644 --- a/data/rules/mongodb.yml +++ b/data/rules/mongodb.yml @@ -81,10 +81,6 @@ rules: mongodb(?:\+srv)?://[\S]{3,50}:(?:[\S]{3,88})@[-.%\w/:]+ ) \b - pattern_requirements: - ignore_if_contains: - - "@localhost" - - "@127.0.0.1" min_entropy: 3 examples: - client = mongoc_client_new ("mongodb+srv://someuser:hunter2@my-atlas-rd941.mongodb.net/test?retryWrites=true&w=majority"); diff --git a/data/rules/ngrok.yml b/data/rules/ngrok.yml index 951cfd5..be45f08 100644 --- a/data/rules/ngrok.yml +++ b/data/rules/ngrok.yml @@ -7,7 +7,11 @@ rules: (?:.|[\\n\r]){0,32}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,32}? 
- (?:[a-z0-9]{25,30}_\d[a-z0-9]{20}|(?:cr_|ak_)[a-z0-9]{25,30}) + ( + (?:[a-z0-9]{25,30}_\d[a-z0-9]{20} + | + (?:cr_|ak_)[a-z0-9]{25,30}) + ) \b min_entropy: 4 examples: diff --git a/data/rules/postgres.yml b/data/rules/postgres.yml index bf4d361..5cbb4ba 100644 --- a/data/rules/postgres.yml +++ b/data/rules/postgres.yml @@ -24,11 +24,6 @@ rules: (?: \d+ ) - pattern_requirements: - ignore_if_contains: - - "@localhost" - - "@127.0.0.1" - - "localhost:" min_entropy: 3.3 confidence: medium examples: From 12730bb609859212cc49100d78ebace93d6c3a81 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Fri, 7 Nov 2025 16:31:24 -0800 Subject: [PATCH 11/19] Added checksum comparisons to pattern_requirements, new suffix, crc32, and base62 Liquid filters, and verbose logging so mismatched checksums are skipped with context rather than reported as findings. --- CHANGELOG.md | 2 + CHANGELOG.md.orig | 288 +++++++++ CHANGELOG.md.rej | 36 ++ Cargo.toml | 7 +- README.md | 9 +- README.md.orig | 1336 +++++++++++++++++++++++++++++++++++++++ README.md.rej | 68 ++ data/rules/github.yml | 28 +- docs/RULES.md | 15 +- src/findings_store.rs | 17 +- src/liquid_filters.rs | 219 ++++++- src/main.rs | 30 +- src/matcher.rs | 81 ++- src/rules/rule.rs | 259 +++++++- src/validation.rs | 57 +- src/validation/utils.rs | 23 +- 16 files changed, 2354 insertions(+), 121 deletions(-) create mode 100644 CHANGELOG.md.orig create mode 100644 CHANGELOG.md.rej create mode 100644 README.md.orig create mode 100644 README.md.rej diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f94263..dabda9a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ All notable changes to this project will be documented in this file. - Added `pattern_requirements` checks to rules, providing lightweight post-regex character-class validation without lookarounds. - Added an `ignore_if_contains` option to `PatternRequirements` to drop matches containing case-insensitive placeholder words, with tests covering the new behavior. 
- Updated rules to adopt the new `pattern_requirements` support. +- Added checksum comparisons to `pattern_requirements`, new `suffix`, `crc32`, and `base62` Liquid filters, and verbose logging so mismatched checksums are skipped with context rather than reported as findings. +- Split GitHub token detections into fine-grained/fixed-format variants and enforce checksum validation for modern GitHub token families (PAT, OAuth, App, refresh) while preserving legacy coverage. - Automatically enable `--no-dedup` when `--manage-baseline` is supplied so baseline management keeps every finding. diff --git a/CHANGELOG.md.orig b/CHANGELOG.md.orig new file mode 100644 index 0000000..9f94263 --- /dev/null +++ b/CHANGELOG.md.orig @@ -0,0 +1,288 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +## [v1.62.0] +- This release is focused on further improving detection accuracy, before even attempting to validate findings. +- Added `pattern_requirements` checks to rules, providing lightweight post-regex character-class validation without lookarounds. +- Added an `ignore_if_contains` option to `PatternRequirements` to drop matches containing case-insensitive placeholder words, with tests covering the new behavior. +- Updated rules to adopt the new `pattern_requirements` support. +- Automatically enable `--no-dedup` when `--manage-baseline` is supplied so baseline management keeps every finding. + + +## [v1.61.0] +- Fixed local filesystem scans to keep `open_path_as_is` enabled when opening Git repositories and only disable it for diff-based scans. +- Created Linux and Windows specific installer script +- Updated diff-focused scanning so `--branch-root-commit` can be provided alongside `--branch`, letting you diff from a chosen commit while targeting a specific branch tip (still defaulting back to the `--branch` ref when the commit is omitted). 
+- Updated rules + +## [v1.60.0] +- Removed the `--bitbucket-username`, `--bitbucket-token`, and `--bitbucket-oauth-token` flags in favour of `KF_BITBUCKET_*` environment variables when authenticating to Bitbucket. +- Added provider-specific `kingfisher scan` subcommands (for example `kingfisher scan github …`) that translate into the legacy flags under the hood. The new layout keeps backwards compatibility while removing the wall of provider options from `kingfisher scan --help`. +- Updated the README so every provider example (GitHub, GitLab, Bitbucket, Azure Repos, Gitea, Hugging Face, Slack, Jira, Confluence, S3, GCS, Docker) uses the new subcommand style. +- Legacy provider flags (for example `--github-user`, `--gitlab-group`, `--bitbucket-workspace`, `--s3-bucket`) still work but now emit a deprecation warning to encourage migration to the new `kingfisher scan ` flow. +- Kept the direct `kingfisher scan /path/to/dir` flow for local filesystem / local git repo scans while adding a `--list-only` switch to each provider subcommand so repository enumeration no longer requires the standalone `github repos`, `gitlab repos`, etc. commands. +- Removed the legacy top-level provider commands (`kingfisher github`, `kingfisher gitlab`, `kingfisher gitea`, `kingfisher bitbucket`, `kingfisher azure`, `kingfisher huggingface`) now that enumeration lives under `kingfisher scan --list-only`. + +## [v1.59.0] +- Fixed `kingfisher scan github …` (and other provider-specific subcommands) so they no longer demand placeholder path arguments before the CLI accepts the request. +- Fixed `kingfisher scan` so that providing `--branch` without `--since-commit` now diffs the branch against the empty tree and scans every commit reachable from that branch. 
+- Added rules for meraki, duffel, finnhub, frameio, freshbooks, gitter, infracost, launchdarkly, lob, maxmind, messagebird, nytimes, prefect, scalingo, sendinblue, sentry, shippo, twitch, typeform + +## [v1.58.0] +- Added first-class Hugging Face scanning support, including CLI enumeration, token authentication, and integration with remote scans. +- Condensed GitError formatting to report the exit status and the first informative lines from stdout/stderr, producing concise git clone failure logs. +- Added support for scanning Google Cloud Storage buckets via `--gcs-bucket`, including optional prefixes and service-account authentication. +- Added `--skip-aws-account` (now accepting comma-separated values) and `--skip-aws-account-file` to bypass live AWS validation for known canary/honey-token account IDs without triggering alerts. Kingfisher now ships with several canary AWS account IDs pre-seeded in the skip list and now reports matching findings as "Not Attempted" with the "Response" containing "(skip list entry)" so it's clear that validation was intentionally skipped and why. 
+ +## [v1.57.0] +- Added inline ignore directive detection to treat suppression tokens anywhere on surrounding lines, including multi-line handling +- Added a `--no-ignore` CLI flag to disable inline directives when you need every potential secret reported +- Added: repeatable `--ignore-comment ` flag to reuse inline directives from other scanners (for example `NOSONAR`, `kics-scan ignore`, `gitleaks:allow`, etc) +- Respect user color settings in update messages by using the same color helper as the main reporter, ensuring consistent output and no ANSI codes on update check, when color is disabled + +## [v1.56.0] +- Fixed tree-sitter scanning bug where passing --no-base64 caused errors to be printed when the file type couldn’t be determined + +## [v1.55.0] +- Added first-class Azure Repos support, including CLI commands, enumeration, and documentation updates +- Improved performance of tree-sitter parsing +- Updated Windows build script to ensure static binary is produced + +## [v1.54.0] +- Added first-class Gitea support, including CLI commands, environment-based authentication, documentation, and integration with scans and repository enumeration. +- Populate the finding path from git blob metadata so history-derived secrets display their file location instead of an empty path +- Replaced Match::finding_id’s SHA1-based hashing with a fast xxh3_64 digest that keeps IDs deterministic while eliminating a hot-path SHA1 dependency + +## [v1.53.0] +- Added first-class Bitbucket support, including CLI commands, authentication helpers, documentation, and integration testing. + +## [v1.52.0] +- Enabled ANSI formatting in the tracing formatter whenever stderr is attached to a terminal so colorized updater messages render correctly instead of showing escape sequences. 
+- Added a new CLI flag, `--user-agent-suffix` to allow developers to append additional information to the user-agent +- Removed the unused --rlimit-nofile flag + +## [1.51.0] +- Added diff-only Git scanning via `--since-commit` and `--branch`, including remote-aware ref resolution so CI jobs can pair `--git-url` clones with pull request branches + +## [1.50.0] +- Added `--github-exclude` and `--gitlab-exclude` options to skip specific repositories when scanning or listing GitHub and GitLab sources, including support for gitignore-style glob patterns + +## [1.49.0] +- Enabled MongoDB URI validation +- AWS + GCP validators now respect HTTPS_PROXY and share a consistent user agent across AWS, GCP, and HTTP validation +- Increase max-file-size default to 256 mb (up from 64 mb) +- Improved AWS rule + +## [1.48.0] +- Improved error message when self-update cannot find the current binary +- Optimized memory usage via string interning and extensive data sharing +- Replaced quadratic match filtering with a per-rule span map, fixing missed secrets in extremely large files and improving scan performance +- Support scanning extremely large files by chunking input into 1 GiB segments with small overlaps, avoiding vectorscan buffer limits while preserving match offsets +- Always use chunked vectorscan, eliminating the slow regex fallback for blobs over 4 GiB +- Skip Base64 scanning for blobs over 64 MB to avoid a second pass over massive files +- Increased max-file-size default to 64 MB (up from 25 MB) + +## [1.47.0] +- MongoDB validator now validates `mongodb+srv://` URIs with a fast timeout instead of skipping them +- Improved rules: github oauth2, diffbot, mailchimp, aws +- Added validation to SauceLabs rule +- Added rules: shodan, bitly, flickr +- Decode Base64 blobs and scan their contents for secrets while skipping short strings for performance. 
This has a small performance impact and can be disabled with `--no-base64` + +## [1.46.0] +- Improved rules: AWS, pem +- Added rules for Ollama, Weights and Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, together.ai, zhipu +- Added `self-update` command to update the binary independently. Now supports updating over a Homebrew-managed binary +- MongoDB validator now checks `mongodb+srv://` URIs with fast-fail timeouts + +## [1.45.0] +- Added `--repo-artifacts` flag to scan repository issues, gists/snippets, and wikis when cloning via `--git-url` +- Added rules for sendbird, mattermost, langchain, notion +- JWT validation hardened to reject alg:none by default (only allowed if explicitly configured), require iss for OIDC/JWKS verification, ensuring "Active Credential" means cryptographically verified and time-valid, not just unexpired +- Updated the Git cloning logic to include all refs and minimize clone output, allowing Kingfisher to analyze pull request and deleted branch history + +## [1.44.0] +- Fixed issue with self-update on Linux +- Reverted the change to json and jsonl outputs by rule +- Added `--skip-regex` and `--skip-word` flags to ignore secrets matching custom patterns or skipwords + +## [1.43.0] +- Added rules for clearbit, kickbox, azure container registry, improved Azure Storage key +- Grouped JSON and JSONL outputs by rule, restoring `matches` arrays in reports + +## [1.42.0] +- Fixed pagination issue when calling GitLab API +- Expanded directory exclusion handling to interpret plain patterns as prefixes, ensuring options like --exclude .git also skip all nested paths +- Updated baseline management to track encountered findings and remove entries that are no longer present, saving the baseline file whenever entries are pruned or new matches are added +- Added rules for authress, clickhouse, codecov, contentful, curl, dropbox, fly.io, hubspot, firecrawl +- Internal refactoring of rule loader, git enumerator, and filetype guesser +- Improved 
language detection + +## [1.41.0] +- Added support for scanning gitlab subgroups, with `kingfisher scan --gitlab-group my-group --gitlab-include-subgroups` +- Added rule for Vercel + +## [1.40.0] +- Dropped the “prevalidated” flag from rule definitions and validation logic so every finding now flows through the standard active/inactive/unknown pipeline, simplifying rule configuration and preventing special‑case bypasses +- Improved Tailscale api key detectors + +## [1.39.0] +- Added support for scanning Confluence pages via `--confluence-url` and `--cql` + +## [1.38.0] +- `--quiet` now suppresses scan summaries and rule statistics unless `--rule-stats` is explicitly provided +- Added X Consumer key detection and validation + +## [1.37.0] +- GitLab: Matched GitLab group repository listings to glab by only enumerating projects that belong directly to each group, without automatically traversing subgroups + +## [1.36.0] +- Fixed GitHub organization and GitLab group scans when using `--git-history=none` +- JWT tokens without both `iss` and `aud` are no longer reported as active credentials + +## [1.35.0] +- Remote scans with `--git-history=none` now clone repositories with a working tree and scan the current files instead of erroring with "No inputs to scan". 
+- Fixed issue where `--redact` did not function properly +- Fixed validation logic for clarifai rule + +## [1.34.0] +- Use system TLS root certificates to support self-hosted GitLab instances with internal CAs +- Added new rule: Coze personal access token +- Updated Supabase rule to detect project URLs and validate their corresponding tokens + +## [1.33.0] +- Fixed header precedence so custom HTTP validation headers like `Accept` are preserved +- Added new Heroku rule + +## [1.32.0] +- Added support for scanning AWS S3 buckets via `--s3-bucket` and optional `--s3-prefix` +- Added `--role-arn` and `--aws-local-profile` flags for S3 authentication alongside `KF_AWS_KEY`/`KF_AWS_SECRET` +- Added progress bar for scanning S3 buckets +- Refactored output reporting and formatting logic + +## [1.31.0] +- New rules: Telegram bot token, OpenWeatherMap, Apify, Groq +- New OpenAI detectors added (@joshlarsen) +- Fixed bug that broke validation when using unnamed group captures + +## [1.30.0] +- Fixed validation caching for HTTP validators to include rendered headers so inactive secrets no longer appear active. 
+- Removed pre-commit installation hook, due to bugs + +## [1.29.0] +- Fixed issue when more than 1 named capture group is used in a rule variable +- Added a new liquid template filters: `b64dec` +- Added custom validator for Coinbase, and a Coinbase rule that uses it + +## [1.28.0] +- Added support for scanning Slack + +## [1.27.0] +- Added Buildkite rule +- Added support for scanning Docker images via `--docker-image` + +## [1.26.0] +- Added rule for ElevenLabs +- Added support for scanning Jira issues via a given JQL (Jira Query Language) + +## [1.25.0] +- Fixed GitLab authentication bug +- Added pre-commit and pre-receive installation hooks +- MongoDB validator now skips `mongodb+srv://` URIs and returns a message that validation was skipped +- Fixed noisy Baseten rule + +## [1.24.0] +- Now generating DEB and RPM packages +- Now releasing Docker images, and updated README +- Added rule for Scale, Deepgram, AssemblyAI + + +## [1.23.0] +- Updating GitHub Action to generate Docker image +- Added rules for Diffbot, ai21, baseten +- Fixed supabase rule +- Added 'alg' to JWT validation output + +## [1.22.0] +- Added rules for Google Gemini AI, Cohere, Stability.ai, Replicate, Runway, Clarifai +- Upgraded dependencies + +## [1.21.0] +- Improved Azure Storage rule +- Added rule to detect TravisCI encrypted values +- Added baseline feature with `--baseline-file` and `--manage-baseline` flags +- Introduced `--exclude` option for skipping paths +- Added tests covering baseline and exclude workflow +- Added validation for JWT tokens that checks `exp` and `nbf` claims +- JWT validation performs OpenID Connect discovery using the `iss` claim and verifies signatures via JWKS +- Removed `--ignore-tests` argument, because the `--exclude` flag provides more granular functionality +- DigitalOcean rule update +- Adafruit rule update + +## [1.20.0] +- Removed confirmation prompt when user provides --self-update flag +- Added support for HTTP request bodies in rule validation +- 
Added new liquid-rs filters: HmacSha1, IsoTimestampNoFracFilter, Replace +- Added rules for mistral, perplexity +- Added validation for Alibaba rule +- Set GIT_TERMINAL_PROMPT=0 when cloning git repos + +## [1.19.0] +- JSON output was missing committer name and email +- Fixed Gitlab rule which was incorrectly identifying certain tokens as valid + +## [1.18.1] +- Restored --version cli argument +- Added test for the argument + +## [1.18.0] +- Added rules for DeepSeek, xAI +- Removed branding +- Added NOTICE file + +## [1.17.1] +- Fixed broken sourcegraph rule +- Added test to prevent this and similar issues + +## [1.17.0] +- Updated README to give proper attribution to Nosey Parker! +- Added rules for sonarcloud, sonarqube, sourcegraph, shopify, truenas, square, sendgrid, nasa, teamcity, truenas, shopify +- Introduced `--ignore-tests` flag – skip files/dirs whose path resembles tests (`test`, `spec`, `fixture`, `example`, `sample`), reducing noise. +## [1.16.0] +- Fix: HTML detection now requires both HTML content-type and "` flow. + - Kept the direct `kingfisher scan /path/to/dir` flow for local filesystem / local git repo scans while adding a `--list-only` switch to each provider subcommand so repository enumeration no longer requires the standalone `github repos`, `gitlab repos`, etc. commands. + - Removed the legacy top-level provider commands (`kingfisher github`, `kingfisher gitlab`, `kingfisher gitea`, `kingfisher bitbucket`, `kingfisher azure`, `kingfisher huggingface`) now that enumeration lives under `kingfisher scan --list-only`. + + ## [v1.59.0] + - Fixed `kingfisher scan github …` (and other provider-specific subcommands) so they no longer demand placeholder path arguments before the CLI accepts the request. + - Fixed `kingfisher scan` so that providing `--branch` without `--since-commit` now diffs the branch against the empty tree and scans every commit reachable from that branch. 
+ - Added rules for meraki, duffel, finnhub, frameio, freshbooks, gitter, infracost, launchdarkly, lob, maxmind, messagebird, nytimes, prefect, scalingo, sendinblue, sentry, shippo, twitch, typeform + + - ## [v1.58.0] + - Added first-class Hugging Face scanning support, including CLI enumeration, token authentication, and integration with remote scans. + - Condensed GitError formatting to report the exit status and the first informative lines from stdout/stderr, producing concise git clone failure logs. + - Added support for scanning Google Cloud Storage buckets via `--gcs-bucket`, including optional prefixes and service-account authentication. diff --git a/Cargo.toml b/Cargo.toml index abb87db..44afc1d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,9 +75,9 @@ include_dir = { version = "0.7", features = ["glob"] } strum = { version = "0.26", features = ["derive"] } sysinfo = "0.31.4" reqwest = { version = "0.12", default-features = false, features = [ - "json", - "gzip", - "brotli", + "json", + "gzip", + "brotli", "deflate", "stream", "rustls-tls", @@ -196,6 +196,7 @@ gcloud-storage = { version = "1.1.1", default-features = false, features = [ "auth", ] } tokei = "12.1.2" +crc32fast = "1.4.0" [target.'cfg(not(windows))'.dependencies] sha1 = { version = "0.10.6", features = ["asm"] } diff --git a/README.md b/README.md index 95a4d68..72736b9 100644 --- a/README.md +++ b/README.md @@ -333,10 +333,13 @@ is independent: - `special_chars` lets you override the set of characters counted as "special" when `min_special_chars` is used. - `ignore_if_contains` lists case-insensitive substrings that should cause a match to be discarded (for example, to drop `test`, `demo`, or `localhost` values). +- `checksum` lets you compare an extracted portion of the match against a Liquid-rendered expectation. 
Provide `actual.template` + and `expected` Liquid snippets (with access to `{{ MATCH }}`, `{{ FULL_MATCH }}`, and any named capture as both its original + case and uppercase alias) and Kingfisher will skip the finding when the rendered values differ. Optional keys such as + `requires_capture` and `skip_if_missing` help you guard against legacy formats while onboarding the checksum-aware variant. -When a match is skipped because of `ignore_if_contains`, Kingfisher logs the event at the `DEBUG` level alongside the rule that -was evaluated. If you need to keep those matches for a particular scan, pass `--no-ignore-if-contains` to `kingfisher scan` to -disable the substring filter without editing any rule files. +When a match is skipped because of `ignore_if_contains` or a checksum mismatch, Kingfisher logs the event at the `DEBUG` level alongside the rule that was evaluated. If you need to keep those matches for a particular scan, pass `--no-ignore-if-contains` to `kingfisher scan` to disable the substring filter without editing any rule files. Verbose mode (`-v`) will also show you the +checksum mismatch lengths so you can confirm why a finding was suppressed. Once you've done that, you can provide your custom rules (defined in a YAML file) and provide it to Kingfisher at runtime --- no recompiling required! diff --git a/README.md.orig b/README.md.orig new file mode 100644 index 0000000..95a4d68 --- /dev/null +++ b/README.md.orig @@ -0,0 +1,1336 @@ +# Kingfisher + +

+ Kingfisher Logo + +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+[![ghcr downloads](https://ghcr-badge.elias.eu.org/shield/mongodb/kingfisher/kingfisher)](https://github.com/mongodb/kingfisher/pkgs/container/kingfisher)
+ + +Kingfisher is a blazingly fast secret-scanning and **live validation** tool built in Rust. + +It combines Intel’s SIMD-accelerated regex engine (Hyperscan) with language-aware parsing to achieve high accuracy at massive scale, and **ships with hundreds of built-in rules** to detect, **validate**, and triage secrets before they ever reach production. + +For a look at how Kingfisher has grown from its early foundations into today's full-featured scanner, see [Lineage and Evolution](#lineage-and-evolution). + +

+ +## Key Features + +### Multiple Scan Targets +
+ +| Files / Dirs | Local Git | GitHub | GitLab | Azure Repos | Bitbucket | Gitea | Hugging Face | +|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:|:-------------:| +| Files / Dirs
Files / Dirs | Local Git
Local Git | GitHub
GitHub | GitLab
GitLab | Azure Repos
Azure Repos | Bitbucket
Bitbucket | Gitea
Gitea |Hugging Face
Hugging Face | + +| Docker | Jira | Confluence | Slack | AWS S3 | Google Cloud | +|:------:|:----:|:-----------:|:-----:|:------:|:---:| +| Docker
Docker | Jira
Jira | Confluence
Confluence | Slack
Slack | AWS S3
AWS S3 | Google Cloud Storage
Cloud Storage | + +
+ +### Performance, Accuracy, and Hundreds of Rules +- **Performance**: multithreaded, Hyperscan‑powered scanning built for huge codebases +- **Extensible rules**: hundreds of built-in detectors plus YAML-defined custom rules ([docs/RULES.md](/docs/RULES.md)) +- **Broad AI SaaS coverage**: finds and validates tokens for OpenAI, Anthropic, Google Gemini, Cohere, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, Together.ai, Zhipu, and many more +- **Compressed Files**: Supports extracting and scanning compressed files for secrets +- **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md)) + +**Learn more:** [Introducing Kingfisher: Real‑Time Secret Detection and Validation](https://www.mongodb.com/blog/post/product-release-announcements/introducing-kingfisher-real-time-secret-detection-validation) + +# Benchmark Results + +See ([docs/COMPARISON.md](docs/COMPARISON.md)) + +

+ Kingfisher Runtime Comparison +

+ +# Table of Contents + +
+ +- [Kingfisher](#kingfisher) + - [Key Features](#key-features) + - [Multiple Scan Targets](#multiple-scan-targets) + - [Performance, Accuracy, and Hundreds of Rules](#performance-accuracy-and-hundreds-of-rules) +- [Benchmark Results](#benchmark-results) +- [Table of Contents](#table-of-contents) +- [Getting Started](#getting-started) + - [Installation](#installation) + - [Pre-built Releases](#pre-built-releases) + - [Homebrew](#homebrew) + - [Linux and macOS](#linux-and-macos) + - [Windows](#windows) + - [Compile](#compile) + - [ Run Kingfisher in Docker](#-run-kingfisher-in-docker) +- [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance) + - [📝 Write Custom Rules!](#-write-custom-rules) + - [Pattern requirements and placeholder filtering](#pattern-requirements-and-placeholder-filtering) +- [🎉 Usage](#-usage) + - [Basic Examples](#basic-examples) + - [Scan with secret validation](#scan-with-secret-validation) + - [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories) + - [Scan a Git repository without validation](#scan-a-git-repository-without-validation) + - [Display only secrets confirmed active by third‑party APIs](#display-only-secrets-confirmed-active-by-thirdparty-apis) + - [Output JSON and capture to a file](#output-json-and-capture-to-a-file) + - [Output SARIF directly to disk](#output-sarif-directly-to-disk) + - [Pipe any text directly into Kingfisher by passing `-`](#pipe-any-text-directly-into-kingfisher-by-passing--) + - [Limit maximum file size scanned (`--max-file-size`)](#limit-maximum-file-size-scanned---max-file-size) + - [Scan using a rule _family_ with one flag](#scan-using-a-rule-family-with-one-flag) + - [Display rule performance statistics](#display-rule-performance-statistics) + - [Scan while ignoring likely test files](#scan-while-ignoring-likely-test-files) + - [Exclude specific paths](#exclude-specific-paths) + - [Scan changes in CI 
pipelines](#scan-changes-in-ci-pipelines) + - [ Scanning an AWS S3 Bucket](#-scanning-an-aws-s3-bucket) + - [ Scanning a Google Cloud Storage Bucket](#-scanning-a-google-cloud-storage-bucket) + - [ Scanning Docker Images](#-scanning-docker-images) + - [ Scanning GitHub](#-scanning-github) + - [Scan GitHub organization (requires `KF_GITHUB_TOKEN`)](#scan-github-organization-requires-kf_github_token) + - [Skip specific GitHub repositories during enumeration](#skip-specific-github-repositories-during-enumeration) + - [Scan remote GitHub repository](#scan-remote-github-repository) + - [ Scanning GitLab](#-scanning-gitlab) + - [Scan GitLab group (requires `KF_GITLAB_TOKEN`)](#scan-gitlab-group-requires-kf_gitlab_token) + - [Scan GitLab user](#scan-gitlab-user) + - [Skip specific GitLab projects during enumeration](#skip-specific-gitlab-projects-during-enumeration) + - [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url) + - [List GitLab repositories](#list-gitlab-repositories) + - [ Scanning Azure Repos](#-scanning-azure-repos) + - [Scan Azure Repos organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-repos-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat) + - [Scan specific Azure Repos projects](#scan-specific-azure-repos-projects) + - [Skip specific Azure repositories during enumeration](#skip-specific-azure-repositories-during-enumeration) + - [List Azure repositories](#list-azure-repositories) + - [ Scanning Gitea](#-scanning-gitea) + - [Scan Gitea organization (requires `KF_GITEA_TOKEN`)](#scan-gitea-organization-requires-kf_gitea_token) + - [Scan Gitea user](#scan-gitea-user) + - [Skip specific Gitea repositories during enumeration](#skip-specific-gitea-repositories-during-enumeration) + - [Scan remote Gitea repository by URL](#scan-remote-gitea-repository-by-url) + - [List Gitea repositories](#list-gitea-repositories) + - [ Scanning Bitbucket](#-scanning-bitbucket) + - [Scan Bitbucket 
workspace](#scan-bitbucket-workspace) + - [Scan Bitbucket user](#scan-bitbucket-user) + - [Skip specific Bitbucket repositories during enumeration](#skip-specific-bitbucket-repositories-during-enumeration) + - [Scan remote Bitbucket repository by URL](#scan-remote-bitbucket-repository-by-url) + - [List Bitbucket repositories](#list-bitbucket-repositories) + - [Authenticate to Bitbucket](#authenticate-to-bitbucket) + - [Self-hosted Bitbucket Server](#self-hosted-bitbucket-server) + - [ Scanning Hugging Face](#-scanning-hugging-face) + - [Scan Hugging Face user](#scan-hugging-face-user) + - [Scan Hugging Face organization](#scan-hugging-face-organization) + - [Scan specific Hugging Face resources](#scan-specific-hugging-face-resources) + - [List Hugging Face repositories](#list-hugging-face-repositories) + - [Authenticate to Hugging Face](#authenticate-to-hugging-face) + - [ Scanning Jira](#-scanning-jira) + - [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query) + - [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues) + - [ Scanning Confluence](#-scanning-confluence) + - [Scan Confluence pages matching a CQL query](#scan-confluence-pages-matching-a-cql-query) + - [ Scanning Slack](#-scanning-slack) + - [Scan Slack messages matching a search query](#scan-slack-messages-matching-a-search-query) + - [Environment Variables for Tokens](#environment-variables-for-tokens) + - [Exit Codes](#exit-codes) + - [Update Checks](#update-checks) +- [🤓 Advanced Options](#-advanced-options) + - [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets) + - [List Builtin Rules](#list-builtin-rules) + - [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run) + - [To add your rules alongside the built‑ins:](#to-add-your-rules-alongside-the-builtins) + - [Other Examples](#other-examples) + - [Customize the HTTP User-Agent](#customize-the-http-user-agent) + - 
[Notable Scan Options](#notable-scan-options) + - [Understanding `--confidence`](#understanding---confidence) + - [Ignore known false positives](#ignore-known-false-positives) + - [Skip Canary Tokens (AWS)](#skip-canary-tokens-aws) + - [Common CLI flows](#common-cli-flows) + - [Inline ignore directives](#inline-ignore-directives) + - [Finding Fingerprint](#finding-fingerprint) + - [Rule Performance Profiling](#rule-performance-profiling) + - [CLI Options](#cli-options) + - [Lineage and Evolution](#lineage-and-evolution) +- [Roadmap](#roadmap) +- [License](#license) + +
+ + +# Getting Started +## Installation +### Pre-built Releases +Pre-built binaries are available from the [Releases](https://github.com/mongodb/kingfisher/releases) section. + +### Homebrew +![Homebrew Formula Version](https://img.shields.io/homebrew/v/kingfisher) + +```bash +brew install kingfisher +``` + +### Linux and macOS + +
+ +Use the bundled installer script to fetch the latest release and place it in +`~/.local/bin` (or a directory of your choice): + +```bash +# Linux, macOS +curl --silent --location \ + https://raw.githubusercontent.com/mongodb/kingfisher/main/scripts/install-kingfisher.sh | \ + bash +``` + +To install into a custom location, pass the desired directory as an argument: + +```bash +curl --silent --location \ + https://raw.githubusercontent.com/mongodb/kingfisher/main/scripts/install-kingfisher.sh | \ + bash -s -- /opt/kingfisher +``` + +
+ +### Windows + +
+ +Download and run the PowerShell installer to place the binary in +`$env:USERPROFILE\bin` (or another directory you specify): + +```powershell +# Windows +Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass -Force +Invoke-WebRequest -Uri 'https://raw.githubusercontent.com/mongodb/kingfisher/main/scripts/install-kingfisher.ps1' -OutFile install-kingfisher.ps1 +./install-kingfisher.ps1 +``` + +You can provide a custom destination using the `-InstallDir` parameter: + +```powershell +./install-kingfisher.ps1 -InstallDir 'C:\Tools\Kingfisher' +``` +
+ + +### Compile +You may compile for your platform via `make` + +
+ +```bash +# NOTE: Requires Docker +make linux + +# macOS --- must build from a macOS host +make darwin + +# Windows x64 --- requires building from a Windows host with Visual Studio installed +./buildwin.bat -force +``` + +```bash +# Build all targets +make linux-all # builds both x64 and arm64 +make darwin-all # builds both x64 and arm64 +make all # builds for every OS and architecture supported +``` + +
+ +### Docker Run Kingfisher in Docker + +Run the dockerized Kingfisher container + +
+ +```bash +# GitHub Container Registry +docker run --rm ghcr.io/mongodb/kingfisher:latest --version + +# Scan the current working directory +# (mounts your code at /src and scans it) +docker run --rm \ + -v "$PWD":/src \ + ghcr.io/mongodb/kingfisher:latest scan /src + + +# Scan while providing a GitHub token +# Mounts your working dir at /proj and passes in the token: +docker run --rm \ + -e KF_GITHUB_TOKEN=ghp_… \ + -v "$PWD":/proj \ + ghcr.io/mongodb/kingfisher:latest \ + scan --git-url https://github.com/org/private_repo.git + +# Scan an S3 bucket +# Credentials can come from KF_AWS_KEY/KF_AWS_SECRET, --role-arn, or --profile +docker run --rm \ + -e KF_AWS_KEY=AKIA... \ + -e KF_AWS_SECRET=g5nYW... \ + ghcr.io/mongodb/kingfisher:latest \ + scan s3 bucket-name + + +# Scan and write a JSON report locally +# Here we: +# 1. Mount $PWD → /proj +# 2. Tell Kingfisher to write findings.json inside /proj/reports +# 3. Ensure ./reports exists on your host so Docker can mount it +mkdir -p reports + +# run and output into host’s ./reports directory +docker run --rm \ + -v "$PWD":/proj \ + ghcr.io/mongodb/kingfisher:latest \ + scan /proj \ + --format json \ + --output /proj/reports/findings.json + + +# Tip: you can combine multiple mounts if you prefer separating source vs. output: +# Here /src is read‑only, and /out holds your generated reports +docker run --rm \ + -v "$PWD":/src:ro \ + -v "$PWD/reports":/out \ + ghcr.io/mongodb/kingfisher:latest \ + scan /src \ + --format json \ + --output /out/findings.json + +``` + +
+ +# 🔐 Detection Rules at a Glance + +Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything from classic cloud keys to the latest AI SaaS tokens. Below is an overview: + +| Category | What we catch | +|----------|---------------| +| **AI SaaS APIs** | OpenAI, Anthropic, Google Gemini, Cohere, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, together.ai, Zhipu, and more | +| **Cloud Providers** | AWS, Azure, GCP, Alibaba Cloud, DigitalOcean, IBM Cloud, Cloudflare, and more | +| **Dev & CI/CD** | GitHub/GitLab tokens, CircleCI, TravisCI, TeamCity, Docker Hub, npm, PyPI, and more | +| **Messaging & Comms** | Slack, Discord, Microsoft Teams, Twilio, Mailgun, SendGrid, Mailchimp, and more | +| **Databases & Data Ops** | MongoDB Atlas, PlanetScale, Postgres DSNs, Grafana Cloud, Datadog, Dynatrace, and more | +| **Payments & Billing** | Stripe, PayPal, Square, GoCardless, and more | +| **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more | +| **Misc. SaaS & Tools** | 1Password, Adobe, Atlassian/Jira, Asana, Netlify, Baremetrics, and more | + +## 📝 Write Custom Rules! + +Kingfisher ships with hundreds of rules with HTTP and service‑specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential. + +However, you may want to add your own custom rules, or modify a detection to better suit your needs / environment. + +First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom Kingfisher rules. + +### Pattern requirements and placeholder filtering + +Every rule can declare optional `pattern_requirements` to enforce additional character checks after a regex matches. Each field +is independent: + +- `min_digits`, `min_uppercase`, `min_lowercase`, and `min_special_chars` enforce complexity thresholds. 
+- `special_chars` lets you override the set of characters counted as "special" when `min_special_chars` is used. +- `ignore_if_contains` lists case-insensitive substrings that should cause a match to be discarded (for example, to drop + `test`, `demo`, or `localhost` values). + +When a match is skipped because of `ignore_if_contains`, Kingfisher logs the event at the `DEBUG` level alongside the rule that +was evaluated. If you need to keep those matches for a particular scan, pass `--no-ignore-if-contains` to `kingfisher scan` to +disable the substring filter without editing any rule files. + +Once you've done that, you can provide your custom rules (defined in a YAML file) and provide it to Kingfisher at runtime --- no recompiling required! + +# 🎉 Usage + +## Basic Examples + +> **Note**  `kingfisher scan` detects whether the input is a Git repository or a plain directory, no extra flags required. + +### Scan with secret validation + +```bash +kingfisher scan /path/to/code +## NOTE: This path can refer to: +# 1. a local git repo +# 2. a directory with many git repos +# 3. or just a folder with files and subdirectories + +## To explicitly prevent scanning git commit history add: +# `--git-history=none` +``` + +### Scan a directory containing multiple Git repositories + +```bash +kingfisher scan /projects/mono‑repo‑dir +``` + +### Scan a Git repository without validation + +```bash +kingfisher scan ~/src/myrepo --no-validate +``` + +### Display only secrets confirmed active by third‑party APIs + +```bash +kingfisher scan /path/to/repo --only-valid +``` + +### Output JSON and capture to a file + +```bash +kingfisher scan . 
--format json | tee kingfisher.json +``` + +### Output SARIF directly to disk + +```bash +kingfisher scan /path/to/repo --format sarif --output findings.sarif +``` + +### Pipe any text directly into Kingfisher by passing `-` + +```bash +cat /path/to/file.py | kingfisher scan - + +``` + +### Limit maximum file size scanned (`--max-file-size`) + +By default, Kingfisher skips files larger than **256 MB**. You can raise or lower this cap per run with `--max-file-size`, which takes a value in **megabytes**. + +```bash +# Scan files up to 500 mb in size +kingfisher scan /some/file --max-file-size 500 +``` + +### Scan using a rule _family_ with one flag + +_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws.*`)_ + +```bash +# Only apply AWS-related rules (kingfisher.aws.1 + kingfisher.aws.2) +kingfisher scan /path/to/repo --rule kingfisher.aws +``` + +### Display rule performance statistics + +```bash +kingfisher scan /path/to/repo --rule-stats +``` + +### Scan while ignoring likely test files + +`--exclude` skips any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) + +```bash +# Scan source but skip likely unit / integration tests +kingfisher scan ./my-project \ + --exclude='[Tt]est' \ + --exclude='spec' \ + --exclude='[Ff]ixture' \ + --exclude='example' \ + --exclude='sample' +``` + +### Exclude specific paths +```bash +# Skip all Python files and any directory named tests +kingfisher scan ./my-project \ + --exclude '*.py' \ + --exclude '[Tt]ests' +``` + +### Scan changes in CI pipelines + +Limit scanning to the delta between your default branch and a pull request branch by combining `--since-commit` with `--branch` (defaults to `HEAD`). This only scans files that differ between the two references, which keeps CI runs fast while still blocking new secrets. 
+ +Use `--branch-root-commit` alongside `--branch` when you need to include a specific commit (and everything after it) in a diff-focused scan without re-examining earlier history. Provide the branch tip (or other comparison ref) via `--branch`, and pass the commit or merge-base you want to include with `--branch-root-commit`. If you omit `--branch-root-commit`, you can still enable `--branch-root` to fall back to treating the `--branch` ref itself as the inclusive root for backwards compatibility. This is especially useful in long-lived branches where you want to resume scanning from a previous review point or from the commit where a hotfix forked. + +> **How is this different from `--since-commit`?** +> `--since-commit` computes a diff between the branch tip and another ref, so it only inspects files that changed between those two points in history. `--branch-root-commit` rewinds to the parent of the commit you provide and then scans everything introduced from that commit forward, even if the files are unchanged relative to another baseline. Reach for `--since-commit` to keep CI scans fast by checking only the latest delta, and use `--branch-root-commit` when you want to re-audit the full contents of a branch starting at a specific commit. + +```bash +kingfisher scan . 
\ + --since-commit origin/main \ + --branch "$CI_BRANCH" +``` + +Another example: +```bash +cd /tmp +git clone https://github.com/micksmix/SecretsTest.git + +cd /tmp/SecretsTest +git checkout feature-1 +# +# scan diff between main and feature-1 branch +kingfisher scan /tmp/SecretsTest --branch feature-1 \ + --since-commit=$(git -C /tmp/SecretsTest merge-base main feature-1) +# +# scan only a specific commit +kingfisher scan /tmp/SecretsTest \ + --branch baba6ccb453963d3f6136d1ace843e48d7007c3f +# +# scan feature-1 starting at a specific commit (inclusive) +kingfisher scan /tmp/SecretsTest --branch feature-1 \ + --branch-root-commit baba6ccb453963d3f6136d1ace843e48d7007c3f +# +# scan feature-1 starting from the commit where the branch diverged from main +kingfisher scan /tmp/SecretsTest --branch feature-1 \ + --branch-root-commit $(git -C /tmp/SecretsTest merge-base main feature-1) +# +# scan from a hotfix commit that should be re-checked before merging +HOTFIX_COMMIT=$(git -C /tmp/SecretsTest rev-parse hotfix~1) +kingfisher scan /tmp/SecretsTest --branch hotfix \ + --branch-root-commit "$HOTFIX_COMMIT" +``` + +When the branch under test is already checked out, `--branch HEAD` or omitting `--branch` entirely is sufficient. Kingfisher exits with `200` when any findings are discovered and `205` when validated secrets are present, allowing CI jobs to fail automatically if new credentials slip in. + +> **Tip:** You can point Kingfisher at a local working tree and scan another branch or commit without changing checkouts. The CLI now resolves repositories from their worktree roots, so commands like the following work without needing to pass the `.git` directory explicitly: + +```bash +kingfisher scan /path/to/local/repo --branch +kingfisher scan C:\\src\\repo --branch +``` + +The same diff-focused workflow works when cloning repositories on the fly with `--git-url`. 
Kingfisher automatically tries remote-tracking names like `origin/main` and `origin/feature-1`, so you can target the branches involved in a pull request without performing a local checkout first. + +```bash +kingfisher scan \ + --git-url https://github.com/org/repo.git \ + --since-commit main \ + --branch development +``` + +When `--since-commit` is omitted, specifying `--branch` scans the requested ref directly. This makes it easy to analyze a feature branch without checking it out locally. + +```bash +# Scan a branch from an existing checkout +kingfisher scan ~/tmp/repo --branch feature-123 + +# Or scan a branch when cloning on the fly +kingfisher scan \ + --git-url https://github.com/org/repo.git \ + --branch origin/feature-123 +``` + +In CI systems that expose the base and head commits explicitly, you can pass those SHAs directly while still using `--git-url`: + +```bash +kingfisher scan \ + --git-url git@github.com:org/repo.git \ + --since-commit "$BASE_COMMIT" \ + --branch "$PR_HEAD_COMMIT" +``` + +If you want to know which files are being skipped, enable verbose debugging (-v) when scanning, which will report any files being skipped by the baseline file (or via --exclude): + +```bash +# Skip all Python files and any directory named tests, and report to stderr any skipped files +kingfisher scan ./my-project \ + --exclude '*.py' \ + --exclude tests \ + -v +``` + +## GitHub Scanning an AWS S3 Bucket +You can scan S3 objects directly: + +```bash +kingfisher scan s3 bucket-name [--prefix path/] +``` + +Credential resolution happens in this order: + +1. `KF_AWS_KEY` and `KF_AWS_SECRET` environment variables +2. `--profile` pointing to a profile in `~/.aws/config` (works with AWS SSO) +3. anonymous access for public buckets + +If `--role-arn` is supplied, the credentials from steps 1–2 are used to assume that role. + +Examples + +```bash +# using explicit keys +export KF_AWS_KEY=AKIA... +export KF_AWS_SECRET=g5nYW... 
+kingfisher scan s3 some-example-bucket + +# Above can also be run as: +KF_AWS_KEY=AKIA... KF_AWS_SECRET=g5nYW... kingfisher scan s3 some-example-bucket + +# using a local profile (e.g., SSO) that exists in your AWS profile (~/.aws/config) +kingfisher scan s3 some-example-bucket --profile default + +# anonymous scan of a bucket, while providing an object prefix to only scan subset of the s3 bucket +kingfisher scan s3 awsglue-datasets \ + --prefix examples/us-legislators/all + +# assuming a role when scanning +kingfisher scan s3 some-example-bucket \ + --role-arn arn:aws:iam::123456789012:role/MyRole + +# anonymous scan of a public bucket +kingfisher scan s3 some-example-bucket +``` + +Docker example: + +```bash +docker run --rm \ + -e KF_AWS_KEY=AKIA... \ + -e KF_AWS_SECRET=g5nYW... \ + ghcr.io/mongodb/kingfisher:latest \ + scan s3 bucket-name +``` + +## Google Cloud Storage Scanning a Google Cloud Storage Bucket + +Use the `gcs` scan subcommand to stream objects directly from Google Cloud Storage. Authentication +uses Application Default Credentials, so you can provide a service-account JSON file via the +`GOOGLE_APPLICATION_CREDENTIALS` environment variable or by passing `--service-account`. Public +buckets work without credentials. + +```bash +kingfisher scan gcs bucket-name + +# scan a sub-tree inside the bucket +kingfisher scan gcs bucket-name --prefix path/to/data/ + +# supply a service-account key explicitly +kingfisher scan gcs bucket-name --service-account /path/to/key.json +``` + +Functional example: +```bash +kingfisher scan gcs cloud-samples-data --prefix "storage/" +``` + + +## Docker Scanning Docker Images + +Kingfisher will first try to use any locally available image, then fall back to pulling via OCI. + +Authentication happens *in this order*: + +1. **`KF_DOCKER_TOKEN`** env var + - If it contains `user:pass`, it’s used as Basic auth + - Otherwise it’s sent as a Bearer token +2. 
**Docker CLI credentials** + - Checks `credHelpers` (per-registry) and `credsStore` in `~/.docker/config.json`. + - Falls back to the legacy `auths` → `auth` (base64) entries. +3. **Anonymous** (no credentials) + + +```bash +# 1) Scan public or already-pulled image +kingfisher scan docker ghcr.io/owasp/wrongsecrets/wrongsecrets-master:latest-master + +# 2) For private registries, explicitly set KF_DOCKER_TOKEN: +# - Basic auth: "user:pass" +# - Bearer only: "TOKEN" +export KF_DOCKER_TOKEN="AWS:$(aws ecr get-login-password --region us-east-1)" +kingfisher scan docker some-private-registry.dkr.ecr.us-east-1.amazonaws.com/base/amazonlinux2023:latest + +# 3) Or rely on your Docker CLI login/keychain: +# (e.g. aws ecr get-login-password … | docker login …) +kingfisher scan docker private.registry.example.com/my-image:tag +``` + +> **Deprecated** +> Legacy scan flags such as `--github-user`, `--gitlab-group`, +> `--bitbucket-workspace`, `--azure-organization`, `--huggingface-user`, +> `--slack-query`, `--jira-url`, `--confluence-url`, `--s3-bucket`, +> `--gcs-bucket`, and `--docker-image` still work for now, but they trigger a +> warning and will be removed in a future release. Migrate to the +> `kingfisher scan ` subcommands below to future-proof your automations. + +## GitHub Scanning GitHub + +### Scan GitHub organization (requires `KF_GITHUB_TOKEN`) + +```bash +kingfisher scan github --organization my-org +``` + +### Skip specific GitHub repositories during enumeration + +Repeat `--github-exclude` for every repository you want to ignore when scanning +users or organizations. You can provide exact repositories like +`OWNER/REPO` or gitignore-style glob patterns such as `owner/*-archive` +(matching is case-insensitive). + +```bash +kingfisher scan github --organization my-org \ + --github-exclude my-org/huge-repo \ + --github-exclude my-org/*-archive +``` + +### Scan remote GitHub repository + +`--git-url` clones the repository and scans its files and history. 
To also inspect +related server-side data, supply `--repo-artifacts`. This flag pulls down the +repository's issues (including pull requests), wiki, and any public gists owned by +the repository owner and scans them for secrets. Fetching these extras counts +against API rate limits and private artifacts require a `KF_GITHUB_TOKEN`. + +> **Why does `--git-url` sometimes report fewer findings than scanning a local checkout?**. +> +> Remote clones created via `--git-url` default to `--mirror`/bare mode so Kingfisher only +> reads the Git history. When you point Kingfisher at an existing working tree (for example +> `kingfisher scan ./repo`), it enumerates both the filesystem contents *and* the Git +> history. Any secrets that are present in the checked-out files therefore appear twice: +> once from the working tree path and once from the commit where the secret entered the +> history. To replicate the remote behavior locally, either scan a bare clone or disable +> history scanning with `--git-history none` when targeting a working tree. + + +```bash +# Scan the repository only +kingfisher scan --git-url https://github.com/org/repo.git + +# Include issues, wiki, and owner gists +kingfisher scan --git-url https://github.com/org/repo.git --repo-artifacts + +# Private repositories or artifacts +KF_GITHUB_TOKEN="ghp_…" kingfisher scan --git-url https://github.com/org/private_repo.git --repo-artifacts +``` + +--- + +## GitLab Scanning GitLab + +### Scan GitLab group (requires `KF_GITLAB_TOKEN`) + +```bash +kingfisher scan gitlab --group my-group +# include repositories from all nested subgroups +kingfisher scan gitlab --group my-group --include-subgroups +``` + +### Scan GitLab user + +```bash +kingfisher scan gitlab --user johndoe +``` + +### Skip specific GitLab projects during enumeration + +Repeat `--gitlab-exclude` for every project path you want to ignore when scanning +users or groups. 
Specify project paths as `group/project` (case-insensitive) or +use gitignore-style glob patterns like `group/**/archive-*` to drop families of +projects across nested subgroups. + +```bash +kingfisher scan gitlab --group my-group \ + --gitlab-exclude my-group/huge-project \ + --gitlab-exclude my-group/**/archive-* +``` + +### Scan remote GitLab repository by URL + +`--git-url` by itself clones the project repository. To include server-side +artifacts owned by the project, add `--repo-artifacts`. Kingfisher will retrieve +the project's issues, wiki, and snippets and scan them for secrets. These extra +requests may take longer and require a `KF_GITLAB_TOKEN` for private projects. + +```bash +# Scan the repository only +kingfisher scan --git-url https://gitlab.com/group/project.git + +# Include issues, wiki, and snippets +kingfisher scan --git-url https://gitlab.com/group/project.git --repo-artifacts + +# Private projects or artifacts +KF_GITLAB_TOKEN="glpat-…" kingfisher scan --git-url https://gitlab.com/group/private_project.git --repo-artifacts +``` + +### List GitLab repositories + +```bash +kingfisher scan gitlab --group my-group --list-only +# include repositories from all nested subgroups +kingfisher scan gitlab --group my-group --include-subgroups --list-only +# skip specific projects when listing or scanning (supports glob patterns) +kingfisher scan gitlab --group my-group --gitlab-exclude my-group/**/legacy-* --list-only +``` +## Azure Repos Scanning Azure Repos + +### Scan Azure Repos organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`) + +```bash +kingfisher scan azure --organization my-org + +# Azure Repos Server example +KF_AZURE_PAT="pat" kingfisher scan azure --organization DefaultCollection --azure-base-url https://ado.internal.example/tfs/ +``` + +### Scan specific Azure Repos projects + +Projects are specified as `ORGANIZATION/PROJECT`. Repeat the flag for multiple projects. 
+ +```bash +kingfisher scan azure --project my-org/payments \ + --project my-org/core-platform +``` + +### Skip specific Azure repositories during enumeration + +Repeat `--azure-exclude` to ignore repositories when scanning organizations or projects. +Use identifiers like `ORGANIZATION/PROJECT/REPOSITORY`. Repositories that share the same +name as their project can be excluded with `ORGANIZATION/PROJECT`, and gitignore-style +patterns such as `my-org/*/archive-*` are also supported. + +```bash +kingfisher scan azure --organization my-org \ + --azure-exclude my-org/payments/legacy-service \ + --azure-exclude my-org/**/archive-* +``` + +### List Azure repositories + +```bash +kingfisher scan azure --organization my-org --list-only +# list repositories for specific projects +kingfisher scan azure --project my-org/app --project my-org/api --list-only +# skip specific repositories while listing (supports glob patterns) +kingfisher scan azure --organization my-org --azure-exclude my-org/**/experimental-* --list-only +``` +## Gitea Scanning Gitea + +### Scan Gitea organization (requires `KF_GITEA_TOKEN`) + +```bash +kingfisher scan gitea --organization my-org +# self-hosted example +KF_GITEA_TOKEN="gtoken" kingfisher scan gitea --organization platform --gitea-api-url https://gitea.internal.example/api/v1/ +``` + +### Scan Gitea user + +```bash +kingfisher scan gitea --user johndoe +``` + +### Skip specific Gitea repositories during enumeration + +Repeat `--gitea-exclude` for each repository you want to ignore when scanning users +or organizations. Accepts `owner/repo` identifiers or gitignore-style glob patterns +like `team/**/archive-*`. + +```bash +kingfisher scan gitea --organization my-org \ + --gitea-exclude my-org/legacy-repo \ + --gitea-exclude my-org/**/archive-* +``` + +### Scan remote Gitea repository by URL + +`--git-url` clones the repository and scans its history. Adding `--repo-artifacts` +also clones the repository wiki if one exists. 
Private repositories and wikis +require `KF_GITEA_TOKEN` (and `KF_GITEA_USERNAME` when cloning via HTTPS). + +```bash +# Scan the repository only +kingfisher scan --git-url https://gitea.com/org/repo.git + +# Include the repository wiki (if present) +KF_GITEA_TOKEN="gtoken" KF_GITEA_USERNAME="org" \ + kingfisher scan --git-url https://gitea.com/org/repo.git --repo-artifacts +``` + +### List Gitea repositories + +```bash +kingfisher scan gitea --organization my-org --list-only +# enumerate every organization visible to the authenticated user +KF_GITEA_TOKEN="gtoken" kingfisher scan gitea --all-gitea-organizations --list-only +# self-hosted example +KF_GITEA_TOKEN="gtoken" kingfisher scan gitea --user johndoe --gitea-api-url https://gitea.internal.example/api/v1/ --list-only +``` +## Bitbucket Scanning Bitbucket +### Scan Bitbucket workspace + +```bash +kingfisher scan bitbucket --workspace my-team +# include Bitbucket Cloud repositories from every accessible workspace +KF_BITBUCKET_USERNAME="$USER" KF_BITBUCKET_APP_PASSWORD="$APP_PASSWORD" \ + kingfisher scan bitbucket --all-workspaces +``` + +### Scan Bitbucket user + +```bash +kingfisher scan bitbucket --user johndoe +``` + +### Skip specific Bitbucket repositories during enumeration + +Use `--bitbucket-exclude` to ignore repositories while scanning users, workspaces, +or projects. Patterns accept either `owner/repo` (case-insensitive) or +gitignore-style globs such as `workspace/**/archive-*`. + +```bash +kingfisher scan bitbucket --workspace my-team \ + --bitbucket-exclude my-team/legacy-repo \ + --bitbucket-exclude my-team/**/archive-* +``` + +### Scan remote Bitbucket repository by URL + +`--git-url` clones the repository and scans its files and history. To inspect +Bitbucket artifacts such as issues, add `--repo-artifacts`. Private artifacts +require credentials (see [Authenticate to Bitbucket](#authenticate-to-bitbucket)). 
+ +```bash +# Scan the repository only +kingfisher scan --git-url https://bitbucket.org/hashashash/secretstest.git + +# Include repository issues +KF_BITBUCKET_USERNAME="user" \ +KF_BITBUCKET_APP_PASSWORD="app-password" \ + kingfisher scan --git-url https://bitbucket.org/workspace/project.git --repo-artifacts +``` + +### List Bitbucket repositories + +```bash +kingfisher scan bitbucket --workspace my-team --list-only +# enumerate all accessible workspaces or projects +KF_BITBUCKET_USERNAME="$USER" KF_BITBUCKET_APP_PASSWORD="$APP_PASSWORD" \ + kingfisher scan bitbucket --all-workspaces --list-only +# filter out repositories using glob patterns +kingfisher scan bitbucket --workspace my-team --bitbucket-exclude my-team/**/experimental-* --list-only +``` + +### Authenticate to Bitbucket + +Kingfisher supports Bitbucket Cloud and Bitbucket Server credentials: + +- **App password or server token** – set `KF_BITBUCKET_USERNAME` and either + `KF_BITBUCKET_APP_PASSWORD`, `KF_BITBUCKET_TOKEN`, or + `KF_BITBUCKET_PASSWORD`. +- **OAuth/PAT token** – set `KF_BITBUCKET_OAUTH_TOKEN`. + +These credentials match the options described in the [ghorg setup +guide](https://github.com/gabrie30/ghorg/blob/master/README.md#bitbucket-setup). + +### Self-hosted Bitbucket Server + +Use `--bitbucket-api-url` to point Kingfisher at your server's REST endpoint, for example +`https://bitbucket.example.com/rest/api/1.0/`. Provide credentials with +`KF_BITBUCKET_USERNAME` plus either `KF_BITBUCKET_TOKEN` or `KF_BITBUCKET_PASSWORD`, +and pass `--ignore-certs` when connecting to HTTP or otherwise insecure instances. +## Hugging Face Scanning Hugging Face + +Hugging Face hosts git repositories for models, datasets, and Spaces. Kingfisher can enumerate and scan all three resource types. 
+ +### Scan Hugging Face user + +```bash +kingfisher scan huggingface --user +``` + +### Scan Hugging Face organization + +```bash +kingfisher scan huggingface --organization +``` + +### Scan specific Hugging Face resources + +Scan individual repositories by ID (owner/name) or by passing the full HTTPS URL: + +```bash +kingfisher scan huggingface --model +kingfisher scan huggingface --dataset https://huggingface.co/datasets// +kingfisher scan huggingface --space +``` + +Use `--huggingface-exclude` to omit results returned by user or organization enumeration. Prefix values with `model:`, `dataset:`, or `space:` when you only want to skip a specific resource type. + +### List Hugging Face repositories + +```bash +kingfisher scan huggingface --user --list-only +``` + +### Authenticate to Hugging Face + +Private repositories require an access token provided through the `KF_HUGGINGFACE_TOKEN` environment variable. For git authentication the helper also honours `KF_HUGGINGFACE_USERNAME` (default `hf_user`). + +## Jira Scanning Jira + +### Scan Jira issues matching a JQL query + +```bash +KF_JIRA_TOKEN="token" kingfisher scan jira --url https://jira.company.com \ + --jql "project = TEST AND status = Open" \ + --max-results 500 +``` + +### Scan the last 1,000 Jira issues: +```bash +KF_JIRA_TOKEN="token" kingfisher scan jira --url https://jira.mongodb.org \ + --jql 'ORDER BY created DESC' \ + --max-results 1000 +``` + +## Confluence Scanning Confluence +### Scan Confluence pages matching a CQL query + +```bash +# Bearer token +KF_CONFLUENCE_TOKEN="token" kingfisher scan confluence --url https://confluence.company.com \ + --cql "label = secret" \ + --max-results 500 + +# Basic auth with username and token +KF_CONFLUENCE_USER="user@example.com" KF_CONFLUENCE_TOKEN="token" \ + kingfisher scan confluence --url https://confluence.company.com \ + --cql "text ~ 'password'" \ + --max-results 500 +``` + +Use the base URL of your Confluence site for `--confluence-url`. 
Kingfisher +automatically adds `/rest/api` to the end, so `https://example.com/wiki` and +`https://example.com` both work depending on your server configuration. + +Generate a personal access token and set it in the `KF_CONFLUENCE_TOKEN` environment variable. By default, Kingfisher sends the token as a bearer token in the `Authorization` header. + +To use basic authentication instead, also set `KF_CONFLUENCE_USER` to your Confluence email address; Kingfisher will then send the username and `KF_CONFLUENCE_TOKEN` as a Basic auth header. If the server responds with a redirect to a login page, the credentials are invalid or lack the required permissions. + +## Slack Scanning Slack +### Scan Slack messages matching a search query + +```bash +KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan slack "from:username has:link" \ + --max-results 1000 + +KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan slack "akia" \ + --max-results 1000 +``` +*The Slack token must be a user token with the `search:read` scope. 
Bot tokens (those beginning with `xoxb-`) cannot call the Slack search API.* + +## Environment Variables for Tokens + +| Variable | Purpose | +| ----------------- | ---------------------------- | +| `KF_GITHUB_TOKEN` | GitHub Personal Access Token | +| `KF_GITLAB_TOKEN` | GitLab Personal Access Token | +| `KF_GITEA_TOKEN` | Gitea Personal Access Token | +| `KF_GITEA_USERNAME` | Username for private Gitea clones (used with `KF_GITEA_TOKEN`) | +| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure Repos Personal Access Token | +| `KF_AZURE_USERNAME` | Username to use with Azure Repos PATs (defaults to `pat` when unset) | +| `KF_BITBUCKET_USERNAME` | Bitbucket username for basic authentication | +| `KF_BITBUCKET_APP_PASSWORD` / `KF_BITBUCKET_TOKEN` | Bitbucket app password or server token | +| `KF_BITBUCKET_OAUTH_TOKEN` | Bitbucket OAuth or PAT token | +| `KF_HUGGINGFACE_TOKEN` | Hugging Face access token for API enumeration and git cloning | +| `KF_HUGGINGFACE_USERNAME` | Optional username for Hugging Face git operations (defaults to `hf_user`) | +| `KF_JIRA_TOKEN` | Jira API token | +| `KF_CONFLUENCE_TOKEN` | Confluence API token | +| `KF_SLACK_TOKEN` | Slack API token | +| `KF_DOCKER_TOKEN` | Docker registry token (`user:pass` or bearer token). 
If unset, credentials from the Docker keychain are used | +| `KF_AWS_KEY` and `KF_AWS_SECRET` | AWS Credentials to use with S3 bucket scanning | + +Set them temporarily per command: + +```bash +KF_GITLAB_TOKEN="glpat-…" kingfisher scan gitlab --group my-group +``` + +Or export for the session: + +```bash +export KF_GITLAB_TOKEN="glpat-…" +``` + +To authenticate Jira requests: +```bash +export KF_JIRA_TOKEN="token" +``` + +To authenticate Confluence requests: +```bash +export KF_CONFLUENCE_TOKEN="token" +``` + +_If no token is provided Kingfisher still works for public repositories._ + +--- + +## Exit Codes + +| Code | Meaning | +| ---- | ----------------------------- | +| 0 | No findings | +| 200 | Findings discovered | +| 205 | Validated findings discovered | + +## Update Checks + +Kingfisher automatically queries GitHub for a newer release when it starts and tells you whether an update is available. + +- **Hands-free updates** – Add `--self-update` to any Kingfisher command + + * If a newer version exists, Kingfisher will download it, replace the running binary, and re-launch itself with the **exact same arguments**. + * If the update fails or no newer release is found, the current run proceeds as normal + +- **Manual update** – Run `kingfisher self-update` to update the binary without scanning + +- **Disable version checks** – Pass `--no-update-check` to skip both the startup and shutdown checks entirely + +# 🤓 Advanced Options + +## Build a Baseline / Detect New Secrets + +There are situations where a repository already contains checked‑in secrets, but you want to ensure no **new** secrets are introduced. A baseline file lets you document the known findings so future scans only report anything that is not already in that list. 
+ +The easiest way to create a baseline is to run a normal scan with the `--manage-baseline` flag (typically at a low confidence level to capture all potential matches): + +```bash +kingfisher scan /path/to/code \ + --confidence low \ + --manage-baseline \ + --baseline-file ./baseline-file.yml +``` + +`--manage-baseline` automatically enables `--no-dedup` so the baseline captures every individual occurrence. + +Use the same YAML file with the `--baseline-file` option on future scans to hide all recorded findings: + +```bash +kingfisher scan /path/to/code \ + --baseline-file /path/to/baseline-file.yaml +``` + +Running the scan again with `--manage-baseline` refreshes the baseline by adding new findings and pruning entries for secrets that no longer appear. See [docs/BASELINE.md](docs/BASELINE.md) for full detail. + +## List Builtin Rules + +```bash +kingfisher rules list +``` + +## To scan using **only** your own `my_rules.yaml` you could run: + +```bash +kingfisher scan \ + --load-builtins=false \ + --rules-path path/to/my_rules.yaml \ + ./src/ +``` + +## To add your rules alongside the built‑ins: + +```bash +kingfisher scan \ + --rules-path ./custom-rules/ \ + --rules-path my_rules.yml \ + ~/path/to/project-dir/ +``` + +## Other Examples + +```bash +# Check custom rules - this ensures all regular expressions compile, and can match the rule's `examples` in the YML file +kingfisher rules check --rules-path ./my_rules.yml + +# List GitHub repos +kingfisher scan github --user my-user --list-only +kingfisher scan github --organization my-org --list-only +# Skip specific repositories when listing or scanning (supports glob patterns) +kingfisher scan github --organization my-org --github-exclude my-org/*-archive --list-only + +``` + +## Customize the HTTP User-Agent + +Kingfisher identifies its HTTP requests with a user-agent that includes the binary name and version followed by a browser-style +string. 
Some environments require extra context, such as a contact address, a change-ticket number, or a temporary test label. +Use the global `--user-agent-suffix` flag to append this information between the Kingfisher identifier and the browser portion: + +```bash +# Attach a contact email to all outbound validation requests +kingfisher --user-agent-suffix "contact=security@example.com" scan path/ + +# Label a one-off experiment +kingfisher --user-agent-suffix "Sept 2025 testing" scan github --user my-user --list-only +``` + +When omitted, Kingfisher defaults to `kingfisher/ Mozilla/5.0 ...`. The suffix is trimmed; passing an empty string +leaves the default unchanged. + +## Notable Scan Options + +- `--no-dedup`: Report every occurrence of a finding (disable the default de-duplicate behavior) +- `--no-base64`: By default, Kingfisher finds and decodes base64 blobs and scans them for secrets. This adds a slight performance overhead; use this flag to disable +- `--confidence `: (low|medium|high) +- `--min-entropy `: Override default threshold +- `--no-binary`: Skip binary files +- `--no-extract-archives`: Do not scan inside archives +- `--extraction-depth `: Specifies how deep nested archives should be extracted and scanned (default: 2) +- `--redact`: Replaces discovered secrets with a one-way hash for secure output +- `--exclude `: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) +- `--baseline-file `: Ignore matches listed in a baseline YAML file +- `--manage-baseline`: Create or update the baseline file with current findings (automatically enables `--no-dedup`) +- `--skip-regex `: Ignore findings whose text matches this regex (repeatable) +- `--skip-word `: Ignore findings containing this case-insensitive word (repeatable) +- `--skip-aws-account `: Skip live AWS validation for findings tied to the specified AWS account number (repeatable, accepts comma-separated lists) +- `--skip-aws-account-file 
`: Load AWS account numbers to skip from a file (one account per line; `#` comments allowed) +- `--ignore-comment `: Honor additional inline directives from other scanners (repeatable; e.g. `--ignore-comment "gitleaks:allow"`) +- `--no-ignore`: Disable inline directives entirely so every match is reported +- `--no-ignore-if-contains`: Ignore the `ignore_if_contains` filter in rules so placeholder words still produce findings + +## Understanding `--confidence` + +The `--confidence` flag sets a minimum confidence threshold, not an exact match. + +- If you pass `--confidence medium`, findings with **medium and higher** confidence (medium + high) will be included. +- If you pass `--confidence low`, you’ll see **all levels** (low, medium, high). + + +### Ignore known false positives + +Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context. + +With `--skip-regex`, these should be Rust compatible regular expressions, which you can test out at [regex101](https://regex101.com) + +```bash +# Skip any finding where the finding mentions TEST_KEY +kingfisher scan --skip-regex '(?i)TEST_KEY' path/ + +# Skip findings that contain the word "dummy" anywhere in the match +kingfisher scan --skip-word dummy path/ + +# Combine multiple patterns +kingfisher scan \ + --skip-regex 'AKIA[0-9A-Z]{16}' \ + --skip-word placeholder \ + --skip-word dummy \ + path/ +``` + +If a `--skip-regex` regular expression fails to compile, the scan aborts with an error so that typos are caught early. + +### Skip Canary Tokens (AWS) + +Canary/honey tokens are intentionally leaked credentials used to catch misuse. Kingfisher can **recognize and skip** known AWS canary accounts so hygiene scans don’t set off alerts. 
+ +**How to skip** +Pass the 12-digit AWS account IDs for your canaries via `--skip-aws-account` (comma-separated) or `--skip-aws-account-file` (one ID per line; blank lines and `#` comments allowed). Kingfisher also ships with a **pre-seeded (but not exhaustive)** list of Thinkst Canary account IDs used by canarytokens.org, so many are skipped automatically. + +```bash +kingfisher scan /path/to/code \ + --skip-aws-account "171436882533,534261010715" + +# or combine preloaded canary IDs with a just-created decoy account +printf '999900001111 \n534261010715' > /tmp/canary_accounts.txt + +kingfisher scan /path/to/repo \ + --skip-aws-account-file /tmp/canary_accounts.txt + +``` + +**What you’ll see** +Findings tied to a skip-listed account report `Validation: Not Attempted` and note in the `Response:` that the entry came from the skip list: + +```bash +AWS SECRET ACCESS KEY => [KINGFISHER.AWS.2] + |Finding.......: + |Fingerprint...: 2141074333616819500 + |Confidence....: medium + |Entropy.......: 5.00 + |Validation....: Not Attempted + |__Response....: (skip list entry) AWS validation not attempted for account 171436882533. + |Language......: Unknown + |Line Num......: 21 + |Path..........: /tmp/test_canary_accounts.log +``` + +**Why this matters** +Skipping prevents noisy tripwires in prod telemetry while keeping the status explicit—“Not Attempted” isn’t a pass. If needed, verify these credentials out-of-band or with a safe, non-triggering method. 
+ + +#### Common CLI flows + +```bash +# Skip a few in-house canaries during a filesystem scan +kingfisher scan repo/ \ + --skip-aws-account "111122223333,444455556666" + +# Read a longer list from disk +kingfisher scan repo/ \ + --skip-aws-account-file /tmp/scripts/canary_accounts.txt + +# Combine preloaded canary IDs with a just-created decoy account +printf '999900001111\n534261010715\n' > /tmp/new_canary.txt + +kingfisher scan /path/to/repo \ + --skip-aws-account-file /tmp/new_canary.txt + +``` + +Tip: if you manage multiple canary fleets (Thinkst, self-hosted alternatives, or bespoke decoys), checkpoint the account IDs alongside your infrastructure-as-code so security teams can rotate or expand the skip list without editing pipelines. + +### Inline ignore directives + +Add `kingfisher:ignore` anywhere on the same line as a finding to silence it. Multi-line strings and PEM-style blocks may also be ignored by placing the directive on the closing delimiter line (for example, `""" # kingfisher:ignore`), on the next logical line after the string, **or** on a comment immediately before the value: + +```python +# kingfisher:ignore +API_KEY = """ +line 1 +line 2 +""" +# kingfisher:ignore +``` + +Kingfisher searches the surrounding lines for these tokens without requiring language-specific comment markers. To reuse existing inline directives from other scanners, add them with repeatable `--ignore-comment` flags (for example `--ignore-comment "gitleaks:allow" --ignore-comment "NOSONAR"`). Use `--no-ignore` when you want to disable inline suppressions entirely. + + +## Finding Fingerprint + +The document below details the four-field formula (rule SHA-1, origin label, start & end offsets) hashed with XXH3-64 to create Kingfisher’s 64-bit finding fingerprint, and explains how this ID powers safe deduplication, plus how `--no-dedup` can be used to show every raw match. 
+See [docs/FINGERPRINT.md](docs/FINGERPRINT.md). + +## Rule Performance Profiling + +Use `--rule-stats` to collect timing information for every rule. After scanning, the summary prints a **Rule Performance Stats** section showing how many matches each rule produced along with its slowest and average match times. This is useful when creating or debugging rules. + +## CLI Options + +```bash +kingfisher scan --help +``` + + +## Lineage and Evolution + +Kingfisher began as an internal fork of Nosey Parker, used as a high-performance foundation for secret detection. + +Since then it has evolved far beyond that starting point, introducing live validation, hundreds of new rules, additional scan targets, and major architectural changes across nearly every subsystem. + +**Key areas of evolution** +- **Live validation** of detected secrets directly within rules +- **Hundreds of new built-in rules** and an expanded YAML rule schema +- **Baseline management** to suppress known findings over time +- **Tree-sitter parsing** layered on Hyperscan for language-aware detection +- **More scan targets** (GitLab, Bitbucket, Gitea, Jira, Confluence, Slack, S3, GCS, Docker, Hugging Face, etc.) 
+- **Compressed Files** scanning support added +- **New storage model** (in-memory + Bloom filter, replacing SQLite) +- **Unified workflow** with JSON/BSON/SARIF outputs +- **Cross-platform builds** for Linux, macOS, and Windows + +# Roadmap + +- More rules +- More targets +- Please file a [feature request](https://github.com/mongodb/kingfisher/issues), or open a PR, if you have features you'd like added + +# License + +[Apache2 License](LICENSE) diff --git a/README.md.rej b/README.md.rej new file mode 100644 index 0000000..af82147 --- /dev/null +++ b/README.md.rej @@ -0,0 +1,68 @@ +@@ -311,54 +311,63 @@ + | **Dev & CI/CD** | GitHub/GitLab tokens, CircleCI, TravisCI, TeamCity, Docker Hub, npm, PyPI, and more | + | **Messaging & Comms** | Slack, Discord, Microsoft Teams, Twilio, Mailgun, SendGrid, Mailchimp, and more | + | **Databases & Data Ops** | MongoDB Atlas, PlanetScale, Postgres DSNs, Grafana Cloud, Datadog, Dynatrace, and more | + | **Payments & Billing** | Stripe, PayPal, Square, GoCardless, and more | + | **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more | + | **Misc. SaaS & Tools** | 1Password, Adobe, Atlassian/Jira, Asana, Netlify, Baremetrics, and more | + + ## 📝 Write Custom Rules! + + Kingfisher ships with hundreds of rules with HTTP and service‑specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential. + + However, you may want to add your own custom rules, or modify a detection to better suit your needs / environment. + + First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom Kingfisher rules. + + ### Pattern requirements and placeholder filtering + + Every rule can declare optional `pattern_requirements` to enforce additional character checks after a regex matches. Each field + is independent: + + - `min_digits`, `min_uppercase`, `min_lowercase`, and `min_special_chars` enforce complexity thresholds. 
+ - `special_chars` lets you override the set of characters counted as "special" when `min_special_chars` is used. + - `ignore_if_contains` lists case-insensitive substrings that should cause a match to be discarded (for example, to drop + `test`, `demo`, or `localhost` values). Kingfisher still accepts the legacy `exclude_words` key as an alias when loading + existing rule files. +- +-When a match is skipped because of `ignore_if_contains`, Kingfisher logs the event at the `DEBUG` level alongside the rule that +-was evaluated. If you need to keep those matches for a particular scan, pass `--no-ignore-if-contains` to `kingfisher scan` to +-disable the substring filter without editing any rule files. ++- `checksum` lets you compare an extracted portion of the match against a Liquid-rendered expectation. Provide `actual.template` ++ and `expected` Liquid snippets (with access to `{{ MATCH }}`, `{{ FULL_MATCH }}`, and any named capture as both its original ++ case and uppercase alias) and Kingfisher will skip the finding when the rendered values differ. Optional keys such as ++ `requires_capture` and `skip_if_missing` help you guard against legacy formats while onboarding the checksum-aware variant. ++ ++When a match is skipped because of `ignore_if_contains` or a checksum mismatch, Kingfisher logs the event at the `DEBUG` level ++alongside the rule that was evaluated. If you need to keep those matches for a particular scan, pass `--no-ignore-if-contains` ++to `kingfisher scan` to disable the substring filter without editing any rule files. Verbose mode (`-v`) will also show you the ++checksum mismatch lengths so you can confirm why a finding was suppressed. ++ ++To support checksum workflows, Kingfisher now includes Liquid helpers such as `suffix` (to slice characters from a match), ++`crc32` (to hash the body), and `base62` (to encode and pad the checksum). You can mix these filters with your own templates to ++mirror provider-specific checksum implementations. 
+ + Once you've done that, you can provide your custom rules (defined in a YAML file) and provide it to Kingfisher at runtime --- no recompiling required! + + # 🎉 Usage + + ## Basic Examples + + > **Note**  `kingfisher scan` detects whether the input is a Git repository or a plain directory, no extra flags required. + + ### Scan with secret validation + + ```bash + kingfisher scan /path/to/code + ## NOTE: This path can refer to: + # 1. a local git repo + # 2. a directory with many git repos + # 3. or just a folder with files and subdirectories + + ## To explicitly prevent scanning git commit history add: + # `--git-history=none` + ``` + + ### Scan a directory containing multiple Git repositories + + ```bash diff --git a/data/rules/github.yml b/data/rules/github.yml index f4c53d8..bf86cac 100644 --- a/data/rules/github.yml +++ b/data/rules/github.yml @@ -40,13 +40,18 @@ rules: pattern: | (?xi) \b - ( - ghp_ - [A-Z0-9]{36} + ( + ghp_(?P[A-Z0-9]{30})(?P[A-Z0-9]{6}) ) pattern_requirements: min_digits: 2 min_lowercase: 2 + checksum: + actual: + template: "{{ MATCH | suffix: 6 }}" + requires_capture: checksum + expected: "{{ BODY | crc32 | base62: 6 }}" + skip_if_missing: true min_entropy: 3.5 examples: - "GITHUB_KEY=ghp_XIxB7KMNdAr3zqWtQqhE94qglHqOzn1D1stg" @@ -82,11 +87,16 @@ rules: (?xi) \b ( - gho_ - [A-Z0-9]{36} + gho_(?P[A-Z0-9]{30})(?P[A-Z0-9]{6}) ) pattern_requirements: - min_digits: 2 + min_digits: 2 + checksum: + actual: + template: "{{ MATCH | suffix: 6 }}" + requires_capture: checksum + expected: "{{ BODY | crc32 | base62: 6 }}" + skip_if_missing: true min_entropy: 3.5 confidence: medium examples: @@ -119,7 +129,7 @@ rules: pattern: | (?xi) ( - ghu_[A-Z0-9]{36} + ghu_(?P[A-Z0-9]{30})(?P[A-Z0-9]{6}) ) examples: - ' "token": "ghu_16C7e42F292c69C2E7C10c838347Ae178B4a",' @@ -153,7 +163,7 @@ rules: pattern: | (?xi) ( - ghs_[A-Z0-9]{36} + ghs_(?P[A-Z0-9]{30})(?P[A-Z0-9]{6}) ) examples: - ' "token": "ghs_16C7e42F292c69C2E7C10c838347Ae178B4a",' @@ -187,7 +197,7 @@ 
rules: pattern: | (?xi) ( - ghr_[A-Z0-9]{76} + ghr_(?P[A-Z0-9]{30})(?P[A-Z0-9]{6}) ) examples: - ' "refresh_token": "ghr_1B4a2e77838347a7E420ce178F2E7c6912E169246c3CE1ccbF66C46812d16D5B1A9Dc86A1498",' diff --git a/docs/RULES.md b/docs/RULES.md index 8a0d964..262f66e 100644 --- a/docs/RULES.md +++ b/docs/RULES.md @@ -117,12 +117,15 @@ Below is the complete list of Liquid filters available in Kingfisher, along with | --------------------- | -------------------------------------------- | -------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------- | | `b64enc` | – | Base64-encodes the input using the standard alphabet. | `{{ TOKEN \| b64enc }}` | | `b64url_enc` | – | URL-safe Base64 (no padding). Useful for JWT headers & payloads. | `{{ TOKEN \| b64url_enc }}` | -| `b64dec` | – | Decodes a Base64 string. | `{{ "aGVsbG8=" \| b64dec }}` | +| `b64dec` | – | Decodes a Base64 string. | `{{ "aGVsbG8=" \| b64dec }}` | | `sha256` | – | Computes the SHA-256 hex digest of the input. | `{{ TOKEN \| sha256 }}` | +| `crc32` | – | Computes the CRC32 checksum of the input and returns a decimal value. | `{{ TOKEN \| crc32 }}` | | `hmac_sha1` | `key` (string) | Computes HMAC-SHA1 over the input, returns Base64-encoded result. | `{{ TOKEN \| hmac_sha1: "secret-key" }}` | | `hmac_sha256` | `key` (string) | Computes HMAC-SHA256 over the input, returns Base64-encoded result. | `{{ TOKEN \| hmac_sha256: "secret-key" }}` | | `hmac_sha384` | `key` (string) | Computes HMAC-SHA384 over the input, returns Base64-encoded result. | `{{ TOKEN \| hmac_sha384: "secret-key" }}` | | `random_string` | `len` (integer, optional) | Generates a cryptographically-secure random alphanumeric string of the specified length (default: 32). | `{{ "" \| random_string: 16 }}` | +| `suffix` | `len` (integer, optional) | Returns the last `len` characters from the string (default: full). 
| `{{ TOKEN \| suffix: 6 }}` | +| `base62` | `width` (integer, optional) | Encodes the input number as Base62, left-padding with zeros as needed. | `{{ TOKEN \| crc32 \| base62: 6 }}` | | `url_encode` | – | Percent-encodes the input according to RFC 3986. | `{{ TOKEN \| url_encode }}` | | `json_escape` | – | Escapes special characters so a string can be safely injected into JSON contexts. | `{{ TOKEN \| json_escape }}` | | `unix_timestamp` | – | Returns the current Unix epoch time in seconds (UTC). | `{{ "" \| unix_timestamp }}` | @@ -269,13 +272,21 @@ pattern_requirements: ignore_if_contains: # Optional: reject matches containing any of these (case-insensitive) - test - demo + checksum: # Optional: compare rendered values to drop mismatched formats + actual: + template: "{{ MATCH | suffix: 6 }}" # Liquid template for the observed checksum + requires_capture: checksum # (optional) skip unless this capture is present + expected: "{{ BODY | crc32 | base62: 6 }}" # Liquid template to render the expected checksum + skip_if_missing: true # (optional) treat missing captures as legacy tokens ``` All fields are optional. If `special_chars` is not specified, the default set includes: `!@#$%^&*()_+-=[]{}|;:'",.<>?/\`~` `ignore_if_contains` performs a case-insensitive substring check. If any entry (after trimming whitespace) appears within the match, the match is discarded. This is helpful for dropping known dummy tokens such as "test" or "demo" that otherwise satisfy the regex. -When this filter removes a match it is logged at the `DEBUG` level so you can see exactly which substring caused the skip. If you need to keep every match even when one of these substrings appears, pass `--no-ignore-if-contains` to `kingfisher scan`. The flag disables this post-processing step without changing the rule definitions. +The optional `checksum` block renders Liquid templates against the match to determine whether the captured checksum matches your expectation. 
Both templates gain access to `{{ MATCH }}`, `{{ FULL_MATCH }}`, and every named capture in two forms: the original capture name and its uppercase alias (e.g. `{{ body }}` and `{{ BODY }}`). Use helper filters like `suffix`, `crc32`, and `base62` to mirror provider-specific checksum pipelines. If a required capture is missing or the rendered values differ, Kingfisher skips the finding—logging the reason, including checksum lengths, at the `DEBUG` level. Set `skip_if_missing` to `true` to treat absent captures as legacy matches. + +When any of these filters remove a match it is logged at the `DEBUG` level so you can see exactly why the skip occurred. If you need to keep every match even when one of these substrings appears, pass `--no-ignore-if-contains` to `kingfisher scan`. The flag disables this post-processing step without changing the rule definitions. ### Example: Secure API Key diff --git a/src/findings_store.rs b/src/findings_store.rs index 23d8c7b..6148fc6 100644 --- a/src/findings_store.rs +++ b/src/findings_store.rs @@ -147,12 +147,23 @@ impl FindingsStore { │ 1. Optional duplicate filter (unchanged) │ └───────────────────────────────────────────────────────────────*/ if dedup { + // Prefer the full unnamed match (index 0). Fall back to a named TOKEN capture + // before using whatever capture is available. let snippet = m .groups .captures - .get(1) - .or_else(|| m.groups.captures.get(0)) - .map_or("", |c| c.value); + .iter() + .find(|c| c.name.is_none() && c.match_number == 0) + .map(|c| c.value) + .or_else(|| { + m.groups + .captures + .iter() + .find(|c| matches!(c.name.as_deref(), Some("TOKEN"))) + .map(|c| c.value) + }) + .or_else(|| m.groups.captures.get(0).map(|c| c.value)) + .unwrap_or(""); let origin_kind = match origin.first() { Origin::GitRepo(_) => "git", diff --git a/src/liquid_filters.rs b/src/liquid_filters.rs index e9d6ea4..5f02b9b 100644 --- a/src/liquid_filters.rs +++ b/src/liquid_filters.rs @@ -1,6 +1,7 @@ //! 
Collection of small Liquid filters that make HTTP validations & API-signing templates easy use base64::{engine::general_purpose, Engine}; +use crc32fast::Hasher; use hmac::{Hmac, Mac}; use liquid_core::{ Display_filter, Error as LiquidError, Expression, Filter, FilterParameters, FilterReflection, @@ -223,22 +224,90 @@ impl Filter for HmacSha384Filter { } // ── random_string ──────────────────────────────── -static_filter!( - /// Random alphanumeric string (default 32 chars). - RandomStringFilter { len: Option }, - "random_string", - |s: &RandomStringFilter, input: &dyn ValueView| -> String { - let n = s.len // explicit argument? - .or_else(|| input.to_kstr().parse().ok()) // else parse input - .unwrap_or(32); // else default +#[derive(Debug, FilterParameters)] +struct RandomStringArgs { + #[parameter(description = "Desired output length", arg_type = "integer")] + len: Option, +} - rand::rng() - .sample_iter(&Alphanumeric) - .take(n) - .map(char::from) - .collect() +#[derive(Clone, ParseFilter, FilterReflection, Default)] +#[filter( + name = "random_string", + description = "Random alphanumeric string (default 32 chars).", + parameters(RandomStringArgs), + parsed(RandomString) +)] +pub struct RandomStringFilter; + +#[derive(Debug, FromFilterParameters, Display_filter)] +#[name = "random_string"] +struct RandomString { + #[parameters] + args: RandomStringArgs, +} + +impl Filter for RandomString { + fn evaluate(&self, input: &dyn ValueView, runtime: &dyn Runtime) -> Result { + let args = self.args.evaluate(runtime)?; + let n = args + .len + .and_then(|value| { + let scalar = Value::scalar(value); + value_to_usize(&scalar) + }) + .or_else(|| input.to_kstr().parse().ok()) + .unwrap_or(32); + + let value: String = + rand::rng().sample_iter(&Alphanumeric).take(n).map(char::from).collect(); + + Ok(Value::scalar(value)) } -); +} + +#[derive(Debug, FilterParameters)] +struct SuffixArgs { + #[parameter(description = "Number of trailing characters to keep", arg_type = 
"integer")] + len: Option, +} + +#[derive(Clone, ParseFilter, FilterReflection, Default)] +#[filter( + name = "suffix", + description = "Return the suffix (last N characters) of the provided string.", + parameters(SuffixArgs), + parsed(Suffix) +)] +pub struct SuffixFilter; + +#[derive(Debug, FromFilterParameters, Display_filter)] +#[name = "suffix"] +struct Suffix { + #[parameters] + args: SuffixArgs, +} + +impl Filter for Suffix { + fn evaluate(&self, input: &dyn ValueView, runtime: &dyn Runtime) -> Result { + let args = self.args.evaluate(runtime)?; + let text = input.to_kstr(); + let requested = args + .len + .and_then(|value| { + let scalar = Value::scalar(value); + value_to_usize(&scalar) + }) + .unwrap_or_else(|| text.len()); + if requested == 0 { + return Ok(Value::scalar(String::new())); + } + + let mut chars: Vec = text.chars().collect(); + let keep = requested.min(chars.len()); + chars.drain(0..chars.len().saturating_sub(keep)); + Ok(Value::scalar(chars.into_iter().collect::())) + } +} #[derive(Debug, Clone, Default, FilterReflection, ParseFilter)] #[filter( @@ -307,6 +376,111 @@ static_filter!( } ); +static_filter!( + /// Compute the CRC32 of the input and return it as a decimal number. 
+ Crc32Filter, + "crc32", + |input: &dyn ValueView| -> i64 { + let mut hasher = Hasher::new(); + hasher.update(input.to_kstr().as_bytes()); + i64::from(hasher.finalize()) + } +); + +#[derive(Debug, FilterParameters)] +struct Base62Args { + #[parameter( + description = "Pad the encoded value to at least this width", + arg_type = "integer" + )] + width: Option, +} + +#[derive(Clone, ParseFilter, FilterReflection, Default)] +#[filter( + name = "base62", + description = "Encode the provided integer value using Base62.", + parameters(Base62Args), + parsed(Base62) +)] +pub struct Base62Filter; + +#[derive(Debug, FromFilterParameters, Display_filter)] +#[name = "base62"] +struct Base62 { + #[parameters] + args: Base62Args, +} + +impl Filter for Base62 { + fn evaluate(&self, input: &dyn ValueView, runtime: &dyn Runtime) -> Result { + let args = self.args.evaluate(runtime)?; + let value = input + .as_scalar() + .and_then(|scalar| { + if let Some(int) = scalar.to_integer() { + Some(if int < 0 { 0 } else { int as u64 }) + } else if let Some(float) = scalar.to_float() { + Some(if float.is_sign_negative() { 0 } else { float.floor() as u64 }) + } else if let Some(boolean) = scalar.to_bool() { + Some(u64::from(boolean)) + } else { + scalar.to_kstr().to_string().parse::().ok() + } + }) + .or_else(|| input.to_kstr().to_string().parse::().ok()) + .unwrap_or(0); + + let mut encoded = encode_base62(value); + if let Some(width) = args.width.and_then(|value| { + let scalar = Value::scalar(value); + value_to_usize(&scalar) + }) { + if encoded.len() < width { + let mut padded = String::with_capacity(width); + for _ in 0..(width - encoded.len()) { + padded.push('0'); + } + padded.push_str(&encoded); + encoded = padded; + } + } + + Ok(Value::scalar(encoded)) + } +} + +fn encode_base62(mut value: u64) -> String { + const ALPHABET: &[u8; 62] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + if value == 0 { + return "0".to_string(); + } + let mut buf = Vec::new(); + while 
value > 0 { + let rem = (value % 62) as usize; + buf.push(ALPHABET[rem] as char); + value /= 62; + } + buf.iter().rev().collect() +} + +fn value_to_usize(value: &Value) -> Option { + let view = value.as_view(); + view.as_scalar() + .and_then(|scalar| { + if let Some(int) = scalar.to_integer() { + Some(if int < 0 { 0 } else { int as usize }) + } else if let Some(float) = scalar.to_float() { + Some(if float.is_sign_negative() { 0 } else { float.floor() as usize }) + } else if let Some(boolean) = scalar.to_bool() { + Some(if boolean { 1 } else { 0 }) + } else { + scalar.to_kstr().parse::().ok() + } + }) + .or_else(|| view.to_kstr().parse::().ok()) +} + // {{ value | b64url_enc }} – URL-safe base64 w/o padding static_filter!( /// Base64 URL-safe (no ‘=’ padding). @@ -415,6 +589,9 @@ pub fn register_all(builder: liquid::ParserBuilder) -> liquid::ParserBuilder { .filter(B64EncFilter::default()) .filter(B64DecFilter::default()) .filter(RandomStringFilter::default()) + .filter(SuffixFilter::default()) + .filter(Crc32Filter::default()) + .filter(Base62Filter::default()) .filter(HmacSha256::default()) .filter(HmacSha1::default()) .filter(HmacSha384::default()) @@ -461,6 +638,20 @@ mod tests { assert_eq!(render(r#"{{ "hello" | sha256 }}"#), expect); } + #[test] + fn suffix_filter() { + assert_eq!(render(r#"{{ "abcdef" | suffix: 3 }}"#), "def"); + assert_eq!(render(r#"{{ "short" | suffix: 10 }}"#), "short"); + assert_eq!(render(r#"{{ "value" | suffix: 0 }}"#), ""); + } + + #[test] + fn crc32_and_base62_filters() { + assert_eq!(render(r#"{{ "hello" | crc32 }}"#), "907060870"); + assert_eq!(render(r#"{{ "hello" | crc32 | base62 }}"#), "zNvy2"); + assert_eq!(render(r#"{{ "hello" | crc32 | base62: 6 }}"#), "0zNvy2"); + } + #[test] fn hmac_sha1_filter() { let key = b"key1"; diff --git a/src/main.rs b/src/main.rs index b52fcd3..99f0718 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,27 +5,27 @@ // * Fallback - system allocator (`system-alloc` feature) // 
──────────────────────────────────────────────────────────── -// --- jemalloc (opt-in) --- -#[cfg(feature = "use-jemalloc")] -#[global_allocator] -static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +// // --- jemalloc (opt-in) --- +// #[cfg(feature = "use-jemalloc")] +// #[global_allocator] +// static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -// --- mimalloc (default) --- -#[cfg(all(not(feature = "use-jemalloc"), not(feature = "system-alloc")))] -#[global_allocator] -static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; - -// --- system allocator (explicit opt-out) --- -#[cfg(feature = "system-alloc")] -use std::alloc::System; -#[cfg(feature = "system-alloc")] -#[global_allocator] -static GLOBAL: System = System; +// // --- mimalloc (default) --- +// #[cfg(all(not(feature = "use-jemalloc"), not(feature = "system-alloc")))] +// #[global_allocator] +// static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; +// // --- system allocator (explicit opt-out) --- +// #[cfg(feature = "system-alloc")] // use std::alloc::System; +// #[cfg(feature = "system-alloc")] // #[global_allocator] // static GLOBAL: System = System; +use std::alloc::System; +#[global_allocator] +static GLOBAL: System = System; + use std::{ io::{IsTerminal, Read}, sync::{Arc, Mutex}, diff --git a/src/matcher.rs b/src/matcher.rs index fb39ab4..b820233 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -29,7 +29,7 @@ use crate::{ parser, parser::{Checker, Language}, rule_profiling::{ConcurrentRuleProfiler, RuleStats, RuleTimer}, - rules::rule::{PatternValidationResult, Rule}, + rules::rule::{PatternRequirementContext, PatternValidationResult, Rule}, rules_database::RulesDatabase, safe_list::{is_safe_match, is_user_match}, scanner_pool::ScannerPool, @@ -614,7 +614,12 @@ fn filter_match<'b>( // Check character requirements if specified if let Some(char_reqs) = rule.pattern_requirements() { - match char_reqs.validate(mi_bytes, respect_ignore_if_contains) { + 
let context = PatternRequirementContext { + regex: re, + captures: &captures, + full_match: full_bytes, + }; + match char_reqs.validate(mi_bytes, Some(context), respect_ignore_if_contains) { PatternValidationResult::Passed => {} PatternValidationResult::Failed => { debug!( @@ -623,6 +628,15 @@ fn filter_match<'b>( ); continue; } + PatternValidationResult::FailedChecksum { actual_len, expected_len } => { + debug!( + "Skipping match for rule {} due to checksum mismatch (actual_len={}, expected_len={})", + rule.id(), + actual_len, + expected_len + ); + continue; + } PatternValidationResult::IgnoredBySubstring { matched_term } => { debug!( "Skipping match for rule {} because it contains ignored term {matched_term}", @@ -790,40 +804,31 @@ impl SerializableCaptures { redact: bool, ) -> Self { let mut serialized_captures: SmallVec<[SerializableCapture; 2]> = SmallVec::new(); - // Process named captures - for name in re.capture_names().flatten() { - if let Some(capture) = captures.name(name) { - let value = if redact { - redact_value(&String::from_utf8_lossy(capture.as_bytes())) - } else { - String::from_utf8_lossy(capture.as_bytes()).to_string() - }; - serialized_captures.push(SerializableCapture { - name: Some(name.to_string()), - match_number: -1, - start: capture.start(), - end: capture.end(), - value: intern(&value), - }); - } - } - // Process unnamed captures (numbered groups) + + let capture_names: SmallVec<[Option; 4]> = + re.capture_names().map(|name| name.map(str::to_string)).collect(); + for i in 0..captures.len() { - if let Some(capture) = captures.get(i) { + if let Some(cap) = captures.get(i) { let value = if redact { - redact_value(&String::from_utf8_lossy(capture.as_bytes())) + redact_value(&String::from_utf8_lossy(cap.as_bytes())) } else { - String::from_utf8_lossy(capture.as_bytes()).to_string() + String::from_utf8_lossy(cap.as_bytes()).to_string() }; + let interned = intern(&value); + + let name = capture_names.get(i).and_then(|opt| 
opt.as_ref()).cloned(); + serialized_captures.push(SerializableCapture { - name: None, + name, match_number: i32::try_from(i).unwrap_or(0), - start: capture.start(), - end: capture.end(), - value: intern(&value), + start: cap.start(), + end: cap.end(), + value: interned, }); } } + SerializableCaptures { captures: serialized_captures } } } @@ -1182,6 +1187,7 @@ mod test { min_special_chars: None, special_chars: None, ignore_if_contains: Some(vec!["TEST".to_string()]), + checksum: None, }), })]; @@ -1244,6 +1250,7 @@ mod test { min_special_chars: None, special_chars: None, ignore_if_contains: Some(vec!["TEST".to_string()]), + checksum: None, }), })]; @@ -1500,4 +1507,24 @@ line2 Ok(()) } + + #[test] + fn serializes_captures_in_numeric_order() { + let re = + Regex::new(r"(?xi)\b(ghp_(?P[A-Z0-9]{3})(?P[A-Z0-9]{2}))").unwrap(); + let caps = re.captures(b"ghp_ABC12").expect("expected captures"); + + let serialized = SerializableCaptures::from_captures(&caps, b"", &re, false); + let entries: Vec<(Option<&str>, i32, &str)> = serialized + .captures + .iter() + .map(|cap| (cap.name.as_deref(), cap.match_number, cap.value)) + .collect(); + + assert_eq!(entries.len(), 4); + assert_eq!(entries[0], (None, 0, "ghp_ABC12")); + assert_eq!(entries[1], (None, 1, "ghp_ABC12")); + assert_eq!(entries[2], (Some("body"), 2, "ABC")); + assert_eq!(entries[3], (Some("checksum"), 3, "12")); + } } diff --git a/src/rules/rule.rs b/src/rules/rule.rs index f1f9615..2284e1f 100644 --- a/src/rules/rule.rs +++ b/src/rules/rule.rs @@ -10,6 +10,10 @@ use std::{ use anyhow::{anyhow, Context, Result}; use lazy_static::lazy_static; +use liquid::{ + model::{KString, Value}, + object, Parser, ParserBuilder, +}; use regex::Regex; use schemars::{ gen::SchemaGenerator, @@ -17,9 +21,12 @@ use schemars::{ JsonSchema, }; use serde::{Deserialize, Serialize}; +use tracing::debug; // use sha1::{Digest, Sha1}; use xxhash_rust::xxh3::xxh3_64; +use crate::liquid_filters; + /// Returns false as the default value. 
fn default_false() -> bool { false @@ -73,6 +80,42 @@ pub struct PatternRequirements { /// Words that should cause the match to be excluded when present (case-insensitive) #[serde(default)] pub ignore_if_contains: Option<Vec<String>>, + /// Optional checksum validation configuration. + #[serde(default)] + pub checksum: Option<ChecksumRequirement>, +} + +/// Defines a checksum validation strategy for a matched pattern. +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] +pub struct ChecksumRequirement { + /// Template describing how to extract the checksum from the match. + pub actual: ChecksumActual, + /// Template describing how to compute the expected checksum. + pub expected: String, + /// When true, checksum evaluation is skipped if the required capture is missing. + #[serde(default)] + pub skip_if_missing: bool, +} + +/// Describes how to extract the checksum value from a match. +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] +pub struct ChecksumActual { + /// Liquid template used to compute the checksum from the match. + pub template: String, + /// Optional capture group that must be present before evaluating the checksum. + #[serde(default)] + pub requires_capture: Option<String>, +} + +/// Contextual information available when validating pattern requirements. +#[derive(Clone, Copy)] +pub struct PatternRequirementContext<'a> { + /// Compiled regex associated with the rule. + pub regex: &'a regex::bytes::Regex, + /// Captures for the current match. + pub captures: &'a regex::bytes::Captures<'a>, + /// Full bytes matched by the rule (capture group 0). 
+ pub full_match: &'a [u8], } impl PatternRequirements { @@ -85,6 +128,7 @@ impl PatternRequirements { pub fn validate( &self, input: &[u8], + context: Option<PatternRequirementContext<'_>>, respect_ignore_if_contains: bool, ) -> PatternValidationResult { // Convert to string (lossy for non-UTF8) @@ -151,10 +195,84 @@ impl PatternRequirements { } } + if let Some(checksum) = &self.checksum { + let Some(ctx) = context else { + return if checksum.skip_if_missing { + PatternValidationResult::Passed + } else { + PatternValidationResult::Failed + }; + }; + + if let Some(required) = checksum.actual.requires_capture.as_deref() { + if ctx.captures.name(required).is_none() { + return if checksum.skip_if_missing { + PatternValidationResult::Passed + } else { + PatternValidationResult::Failed + }; + } + } + + let mut globals = object!({ + "MATCH": s.to_string(), + "FULL_MATCH": String::from_utf8_lossy(ctx.full_match).to_string(), + }); + + for name in ctx.regex.capture_names().flatten() { + if let Some(capture) = ctx.captures.name(name) { + let value = String::from_utf8_lossy(capture.as_bytes()).to_string(); + globals.insert(KString::from_ref(name), Value::scalar(value.clone())); + globals.insert( + KString::from_string(name.to_ascii_uppercase()), + Value::scalar(value), + ); + } + } + + let actual = + match render_pattern_requirement_template(&checksum.actual.template, &globals) { + Ok(rendered) => rendered, + Err(err) => { + debug!( + "Failed to render checksum actual template '{}': {}", + checksum.actual.template, err + ); + return PatternValidationResult::Failed; + } + }; + let expected = match render_pattern_requirement_template(&checksum.expected, &globals) { + Ok(rendered) => rendered, + Err(err) => { + debug!( + "Failed to render checksum expected template '{}': {}", + checksum.expected, err + ); + return PatternValidationResult::Failed; + } + }; + + if actual != expected { + let actual_len = actual.chars().count(); + let expected_len = expected.chars().count(); + return 
PatternValidationResult::FailedChecksum { actual_len, expected_len }; + } + } + PatternValidationResult::Passed } } +fn render_pattern_requirement_template( + template: &str, + globals: &liquid::Object, +) -> Result<String, String> { + PATTERN_REQUIREMENTS_TEMPLATE_PARSER + .parse(template) + .map_err(|e| e.to_string()) + .and_then(|parsed| parsed.render(globals).map_err(|e| e.to_string())) +} + /// Result of validating [`PatternRequirements`] against a potential match. #[derive(Debug, PartialEq, Eq)] pub enum PatternValidationResult { @@ -162,6 +280,8 @@ pub enum PatternValidationResult { Passed, /// Requirements were not satisfied. Failed, + /// Checksum requirements were not satisfied; captures basic mismatch details for debugging. + FailedChecksum { actual_len: usize, expected_len: usize }, /// The match contains one of the `ignore_if_contains` substrings and should be skipped. IgnoredBySubstring { matched_term: String }, } @@ -407,6 +527,10 @@ lazy_static! { pub static ref RULE_COMMENTS_PATTERN: Regex = Regex::new( r"(?m)(\(\?#[^)]*\))|(\s\#[\sa-zA-Z]*$)" ).expect("comment-stripping regex should compile"); + static ref PATTERN_REQUIREMENTS_TEMPLATE_PARSER: liquid::Parser = + liquid_filters::register_all(ParserBuilder::with_stdlib()) + .build() + .expect("pattern requirement template parser should compile"); } impl RuleSyntax { @@ -564,6 +688,7 @@ impl Rule { #[cfg(test)] mod tests { use super::*; + use regex::bytes::Regex as BytesRegex; #[test] fn test_pattern_requirements_digits() { @@ -574,16 +699,75 @@ mod tests { min_special_chars: None, special_chars: None, ignore_if_contains: None, + checksum: None, }; // Should pass: has 3 digits - assert!(matches!(reqs.validate(b"abc123def", true), PatternValidationResult::Passed)); + assert!(matches!(reqs.validate(b"abc123def", None, true), PatternValidationResult::Passed)); // Should fail: only 1 digit - assert!(matches!(reqs.validate(b"abc1def", true), PatternValidationResult::Failed)); + assert!(matches!(reqs.validate(b"abc1def", 
None, true), PatternValidationResult::Failed)); // Should fail: no digits - assert!(matches!(reqs.validate(b"abcdef", true), PatternValidationResult::Failed)); + assert!(matches!(reqs.validate(b"abcdef", None, true), PatternValidationResult::Failed)); + } + + #[test] + fn test_pattern_requirements_checksum() { + let reqs = PatternRequirements { + min_digits: None, + min_uppercase: None, + min_lowercase: None, + min_special_chars: None, + special_chars: None, + ignore_if_contains: None, + checksum: Some(ChecksumRequirement { + actual: ChecksumActual { + template: "{{ MATCH | suffix: 6 }}".to_string(), + requires_capture: Some("checksum".to_string()), + }, + expected: "{{ BODY | crc32 | base62: 6 }}".to_string(), + skip_if_missing: true, + }), + }; + + let token = b"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"; + let regex = + BytesRegex::new(r"(?x) ghp_(?P<body>[A-Za-z0-9]{30})(?P<checksum>[A-Za-z0-9]{6})") + .unwrap(); + let captures = regex.captures(token).expect("token should match"); + assert!(matches!( + reqs.validate( + token, + Some(PatternRequirementContext { + regex: &regex, + captures: &captures, + full_match: token + }), + true + ), + PatternValidationResult::Passed + )); + + let mut invalid = token.to_vec(); + *invalid.last_mut().unwrap() = b'0'; + let captures_invalid = + regex.captures(&invalid).expect("invalid token should still match pattern"); + assert!(matches!( + reqs.validate( + &invalid, + Some(PatternRequirementContext { + regex: &regex, + captures: &captures_invalid, + full_match: &invalid, + }), + true + ), + PatternValidationResult::FailedChecksum { .. 
} + )); + + let legacy = b"ghp_legacy_token"; + assert!(matches!(reqs.validate(legacy, None, true), PatternValidationResult::Passed)); } #[test] @@ -595,16 +779,17 @@ mod tests { min_special_chars: None, special_chars: None, ignore_if_contains: None, + checksum: None, }; // Should pass: has 3 uppercase - assert!(matches!(reqs.validate(b"ABCdef", true), PatternValidationResult::Passed)); + assert!(matches!(reqs.validate(b"ABCdef", None, true), PatternValidationResult::Passed)); // Should fail: only 1 uppercase - assert!(matches!(reqs.validate(b"Adef", true), PatternValidationResult::Failed)); + assert!(matches!(reqs.validate(b"Adef", None, true), PatternValidationResult::Failed)); // Should fail: no uppercase - assert!(matches!(reqs.validate(b"abcdef", true), PatternValidationResult::Failed)); + assert!(matches!(reqs.validate(b"abcdef", None, true), PatternValidationResult::Failed)); } #[test] @@ -616,16 +801,17 @@ mod tests { min_special_chars: None, special_chars: None, ignore_if_contains: None, + checksum: None, }; // Should pass: has 3 lowercase - assert!(matches!(reqs.validate(b"ABCdef", true), PatternValidationResult::Passed)); + assert!(matches!(reqs.validate(b"ABCdef", None, true), PatternValidationResult::Passed)); // Should fail: only 1 lowercase - assert!(matches!(reqs.validate(b"ABCd", true), PatternValidationResult::Failed)); + assert!(matches!(reqs.validate(b"ABCd", None, true), PatternValidationResult::Failed)); // Should fail: no lowercase - assert!(matches!(reqs.validate(b"ABC123", true), PatternValidationResult::Failed)); + assert!(matches!(reqs.validate(b"ABC123", None, true), PatternValidationResult::Failed)); } #[test] @@ -637,16 +823,17 @@ mod tests { min_special_chars: Some(2), special_chars: None, // uses default ignore_if_contains: None, + checksum: None, }; // Should pass: has 2 special chars - assert!(matches!(reqs.validate(b"abc!@def", true), PatternValidationResult::Passed)); + assert!(matches!(reqs.validate(b"abc!@def", None, true), 
PatternValidationResult::Passed)); // Should fail: only 1 special char - assert!(matches!(reqs.validate(b"abc!def", true), PatternValidationResult::Failed)); + assert!(matches!(reqs.validate(b"abc!def", None, true), PatternValidationResult::Failed)); // Should fail: no special chars - assert!(matches!(reqs.validate(b"abcdef", true), PatternValidationResult::Failed)); + assert!(matches!(reqs.validate(b"abcdef", None, true), PatternValidationResult::Failed)); } #[test] @@ -658,16 +845,17 @@ mod tests { min_special_chars: Some(2), special_chars: Some("$%^".to_string()), ignore_if_contains: None, + checksum: None, }; // Should pass: has 2 custom special chars - assert!(matches!(reqs.validate(b"abc$%def", true), PatternValidationResult::Passed)); + assert!(matches!(reqs.validate(b"abc$%def", None, true), PatternValidationResult::Passed)); // Should fail: has special chars but not the custom ones - assert!(matches!(reqs.validate(b"abc!@def", true), PatternValidationResult::Failed)); + assert!(matches!(reqs.validate(b"abc!@def", None, true), PatternValidationResult::Failed)); // Should fail: only 1 custom special char - assert!(matches!(reqs.validate(b"abc$def", true), PatternValidationResult::Failed)); + assert!(matches!(reqs.validate(b"abc$def", None, true), PatternValidationResult::Failed)); } #[test] @@ -679,22 +867,23 @@ mod tests { min_special_chars: Some(1), special_chars: None, ignore_if_contains: None, + checksum: None, }; // Should pass: has all requirements - assert!(matches!(reqs.validate(b"Abc1!", true), PatternValidationResult::Passed)); + assert!(matches!(reqs.validate(b"Abc1!", None, true), PatternValidationResult::Passed)); // Should fail: missing digit - assert!(matches!(reqs.validate(b"Abc!", true), PatternValidationResult::Failed)); + assert!(matches!(reqs.validate(b"Abc!", None, true), PatternValidationResult::Failed)); // Should fail: missing uppercase - assert!(matches!(reqs.validate(b"abc1!", true), PatternValidationResult::Failed)); + 
assert!(matches!(reqs.validate(b"abc1!", None, true), PatternValidationResult::Failed)); // Should fail: missing lowercase - assert!(matches!(reqs.validate(b"ABC1!", true), PatternValidationResult::Failed)); + assert!(matches!(reqs.validate(b"ABC1!", None, true), PatternValidationResult::Failed)); // Should fail: missing special - assert!(matches!(reqs.validate(b"Abc1", true), PatternValidationResult::Failed)); + assert!(matches!(reqs.validate(b"Abc1", None, true), PatternValidationResult::Failed)); } #[test] @@ -706,22 +895,26 @@ mod tests { min_special_chars: None, special_chars: None, ignore_if_contains: Some(vec!["test".to_string(), "Demo".to_string()]), + checksum: None, }; // Should fail: contains "test" (case-insensitive) assert!(matches!( - reqs.validate(b"MyTestToken", true), + reqs.validate(b"MyTestToken", None, true), PatternValidationResult::IgnoredBySubstring { .. } )); // Should fail: contains "demo" (case-insensitive) assert!(matches!( - reqs.validate(b"example-demo-value", true), + reqs.validate(b"example-demo-value", None, true), PatternValidationResult::IgnoredBySubstring { .. } )); // Should pass: does not contain excluded words - assert!(matches!(reqs.validate(b"example-value", true), PatternValidationResult::Passed)); + assert!(matches!( + reqs.validate(b"example-value", None, true), + PatternValidationResult::Passed + )); } #[test] @@ -733,14 +926,15 @@ mod tests { min_special_chars: None, special_chars: None, ignore_if_contains: Some(vec![" ".to_string(), "".to_string(), "BLOCK".to_string()]), + checksum: None, }; // Should fail only when non-empty exclusion matches assert!(matches!( - reqs.validate(b"needs-blocking", true), + reqs.validate(b"needs-blocking", None, true), PatternValidationResult::IgnoredBySubstring { .. 
} )); - assert!(matches!(reqs.validate(b"allowed", true), PatternValidationResult::Passed)); + assert!(matches!(reqs.validate(b"allowed", None, true), PatternValidationResult::Passed)); } #[test] @@ -752,16 +946,20 @@ mod tests { min_special_chars: None, special_chars: None, ignore_if_contains: Some(vec!["ignoreme".to_string()]), + checksum: None, }; // With ignoring enabled, the match is skipped assert!(matches!( - reqs.validate(b"value-ignoreme", true), + reqs.validate(b"value-ignoreme", None, true), PatternValidationResult::IgnoredBySubstring { .. } )); // With ignoring disabled, the same input passes requirements - assert!(matches!(reqs.validate(b"value-ignoreme", false), PatternValidationResult::Passed)); + assert!(matches!( + reqs.validate(b"value-ignoreme", None, false), + PatternValidationResult::Passed + )); } #[test] @@ -773,11 +971,12 @@ mod tests { min_special_chars: None, special_chars: None, ignore_if_contains: None, + checksum: None, }; // Should pass: no requirements - assert!(matches!(reqs.validate(b"anything", true), PatternValidationResult::Passed)); - assert!(matches!(reqs.validate(b"123", true), PatternValidationResult::Passed)); - assert!(matches!(reqs.validate(b"!@#", true), PatternValidationResult::Passed)); + assert!(matches!(reqs.validate(b"anything", None, true), PatternValidationResult::Passed)); + assert!(matches!(reqs.validate(b"123", None, true), PatternValidationResult::Passed)); + assert!(matches!(reqs.validate(b"!@#", None, true), PatternValidationResult::Passed)); } } diff --git a/src/validation.rs b/src/validation.rs index 073b615..3d47d48 100644 --- a/src/validation.rs +++ b/src/validation.rs @@ -332,9 +332,7 @@ async fn timed_validate_single_match<'a>( } let mut globals = Object::new(); - for (k, v, ..) 
in &captured_values { - globals.insert(k.to_uppercase().into(), Value::scalar(v.clone())); - } + populate_globals_from_captures(&mut globals, &captured_values); let rule_syntax = m.rule.syntax(); @@ -961,6 +959,59 @@ async fn timed_validate_single_match<'a>( commit_and_return(m); } +fn populate_globals_from_captures( + globals: &mut Object, + captured_values: &[(String, String, usize, usize)], +) { + let mut best_token: Option<(usize, String)> = None; + + for (k, v, ..) in captured_values { + let key = k.to_uppercase(); + if key == "TOKEN" { + if best_token.as_ref().map_or(true, |(len, _)| v.len() >= *len) { + best_token = Some((v.len(), v.clone())); + } + } else { + globals.insert(key.into(), Value::scalar(v.clone())); + } + } + + if let Some((_, token)) = best_token { + globals.insert("TOKEN".into(), Value::scalar(token)); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn populate_globals_prefers_longest_token() { + let captured_values = vec![ + ("TOKEN".to_string(), "short".to_string(), 0usize, 5usize), + ("BODY".to_string(), "body".to_string(), 0usize, 4usize), + ("TOKEN".to_string(), "longervalue".to_string(), 0usize, 11usize), + ]; + + let mut globals = Object::new(); + populate_globals_from_captures(&mut globals, &captured_values); + + assert_eq!(globals.get("TOKEN").map(|v| v.to_string()), Some("longervalue".to_string())); + assert_eq!(globals.get("BODY").map(|v| v.to_string()), Some("body".to_string())); + } + + #[test] + fn populate_globals_handles_missing_token() { + let captured_values = vec![("CHECKSUM".to_string(), "123456".to_string(), 0usize, 6usize)]; + + let mut globals = Object::new(); + populate_globals_from_captures(&mut globals, &captured_values); + + assert!(globals.get("TOKEN").is_none()); + assert_eq!(globals.get("CHECKSUM").map(|v| v.to_string()), Some("123456".to_string())); + } +} + // #[cfg(test)] // mod tests { // use std::sync::Arc; diff --git a/src/validation/utils.rs b/src/validation/utils.rs index 
9e8422d..4db50a3 100644 --- a/src/validation/utils.rs +++ b/src/validation/utils.rs @@ -6,19 +6,11 @@ use crate::validation::SerializableCaptures; /// Return (NAME, value, start, end) for every capture we care about. /// /// * If a capture has a name, use that (upper-cased) -/// * If it’s unnamed, fall back to `"TOKEN"` -/// * Skip the unnamed “whole-match” capture **only when** there are -/// additional captures to return. +/// * If it’s unnamed, fall back to `"TOKEN"` pub fn process_captures(captures: &SerializableCaptures) -> Vec<(String, String, usize, usize)> { - let multiple = captures.captures.len() > 1; - captures .captures .iter() - // Skip the whole-match capture (match_number == 0) only when there - // are additional captures. All other captures – named or unnamed – - // should be preserved. - .filter(|cap| !multiple || cap.match_number != 0) .map(|cap| { let name = cap.name.as_ref().map(|n| n.to_uppercase()).unwrap_or_else(|| "TOKEN".to_string()); @@ -140,7 +132,7 @@ mod tests { } #[test] - fn skips_whole_match_when_multiple() { + fn includes_whole_match_when_multiple() { let captures = SerializableCaptures { captures: smallvec![ SerializableCapture { @@ -160,11 +152,17 @@ mod tests { ], }; let result = process_captures(&captures); - assert_eq!(result, vec![("FOO".to_string(), "bcd".to_string(), 1usize, 4usize)]); + assert_eq!( + result, + vec![ + ("TOKEN".to_string(), "abcde".to_string(), 0usize, 5usize), + ("FOO".to_string(), "bcd".to_string(), 1usize, 4usize), + ] + ); } #[test] - fn includes_unnamed_groups_but_skips_whole_match() { + fn includes_whole_match_and_unnamed_groups() { let captures = SerializableCaptures { captures: smallvec![ SerializableCapture { @@ -188,6 +186,7 @@ mod tests { assert_eq!( result, vec![ + ("TOKEN".to_string(), "aabbcc".to_string(), 0usize, 6usize), ("FOO".to_string(), "aa".to_string(), 0usize, 2usize), ("TOKEN".to_string(), "cc".to_string(), 4usize, 6usize), ] From bd13877b0d4f9ab440c92cd3dbeddc70f52747cb Mon Sep 17 
00:00:00 2001 From: Mick Grove Date: Fri, 7 Nov 2025 16:31:52 -0800 Subject: [PATCH 12/19] Added checksum comparisons to pattern_requirements, new suffix, crc32, and base62 Liquid filters, and verbose logging so mismatched checksums are skipped with context rather than reported as findings. --- .gitignore | 2 + CHANGELOG.md.orig | 288 ---------- CHANGELOG.md.rej | 36 -- README.md.orig | 1336 --------------------------------------------- README.md.rej | 68 --- 5 files changed, 2 insertions(+), 1728 deletions(-) delete mode 100644 CHANGELOG.md.orig delete mode 100644 CHANGELOG.md.rej delete mode 100644 README.md.orig delete mode 100644 README.md.rej diff --git a/.gitignore b/.gitignore index 8cd8f36..e552e7c 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,8 @@ custom.py logs/* *.patch +*.orig +*.rej ### macOS ### # General diff --git a/CHANGELOG.md.orig b/CHANGELOG.md.orig deleted file mode 100644 index 9f94263..0000000 --- a/CHANGELOG.md.orig +++ /dev/null @@ -1,288 +0,0 @@ -# Changelog - -All notable changes to this project will be documented in this file. - -## [v1.62.0] -- This release is focused on further improving detection accuracy, before even attempting to validate findings. -- Added `pattern_requirements` checks to rules, providing lightweight post-regex character-class validation without lookarounds. -- Added an `ignore_if_contains` option to `PatternRequirements` to drop matches containing case-insensitive placeholder words, with tests covering the new behavior. -- Updated rules to adopt the new `pattern_requirements` support. -- Automatically enable `--no-dedup` when `--manage-baseline` is supplied so baseline management keeps every finding. - - -## [v1.61.0] -- Fixed local filesystem scans to keep `open_path_as_is` enabled when opening Git repositories and only disable it for diff-based scans. 
-- Created Linux and Windows specific installer script -- Updated diff-focused scanning so `--branch-root-commit` can be provided alongside `--branch`, letting you diff from a chosen commit while targeting a specific branch tip (still defaulting back to the `--branch` ref when the commit is omitted). -- Updated rules - -## [v1.60.0] -- Removed the `--bitbucket-username`, `--bitbucket-token`, and `--bitbucket-oauth-token` flags in favour of `KF_BITBUCKET_*` environment variables when authenticating to Bitbucket. -- Added provider-specific `kingfisher scan` subcommands (for example `kingfisher scan github …`) that translate into the legacy flags under the hood. The new layout keeps backwards compatibility while removing the wall of provider options from `kingfisher scan --help`. -- Updated the README so every provider example (GitHub, GitLab, Bitbucket, Azure Repos, Gitea, Hugging Face, Slack, Jira, Confluence, S3, GCS, Docker) uses the new subcommand style. -- Legacy provider flags (for example `--github-user`, `--gitlab-group`, `--bitbucket-workspace`, `--s3-bucket`) still work but now emit a deprecation warning to encourage migration to the new `kingfisher scan ` flow. -- Kept the direct `kingfisher scan /path/to/dir` flow for local filesystem / local git repo scans while adding a `--list-only` switch to each provider subcommand so repository enumeration no longer requires the standalone `github repos`, `gitlab repos`, etc. commands. -- Removed the legacy top-level provider commands (`kingfisher github`, `kingfisher gitlab`, `kingfisher gitea`, `kingfisher bitbucket`, `kingfisher azure`, `kingfisher huggingface`) now that enumeration lives under `kingfisher scan --list-only`. - -## [v1.59.0] -- Fixed `kingfisher scan github …` (and other provider-specific subcommands) so they no longer demand placeholder path arguments before the CLI accepts the request. 
-- Fixed `kingfisher scan` so that providing `--branch` without `--since-commit` now diffs the branch against the empty tree and scans every commit reachable from that branch. -- Added rules for meraki, duffel, finnhub, frameio, freshbooks, gitter, infracost, launchdarkly, lob, maxmind, messagebird, nytimes, prefect, scalingo, sendinblue, sentry, shippo, twitch, typeform - -- ## [v1.58.0] -- Added first-class Hugging Face scanning support, including CLI enumeration, token authentication, and integration with remote scans. -- Condensed GitError formatting to report the exit status and the first informative lines from stdout/stderr, producing concise git clone failure logs. -- Added support for scanning Google Cloud Storage buckets via `--gcs-bucket`, including optional prefixes and service-account authentication. -- Added `--skip-aws-account` (now accepting comma-separated values) and `--skip-aws-account-file` to bypass live AWS validation for known canary/honey-token account IDs without triggering alerts. Kingfisher now ships with several canary AWS account IDs pre-seeded in the skip list and now reports matching findings as "Not Attempted" with the "Response" containing "(skip list entry)" so it's clear that validation was intentionally skipped and why. 
- -## [v1.57.0] -- Added inline ignore directive detection to treat suppression tokens anywhere on surrounding lines, including multi-line handling -- Added a `--no-ignore` CLI flag to disable inline directives when you need every potential secret reported -- Added: repeatable `--ignore-comment ` flag to reuse inline directives from other scanners (for example `NOSONAR`, `kics-scan ignore`, `gitleaks:allow`, etc) -- Respect user color settings in update messages by using the same color helper as the main reporter, ensuring consistent output and no ANSI codes on update check, when color is disabled - -## [v1.56.0] -- Fixed tree-sitter scanning bug where passing --no-base64 caused errors to be printed when the file type couldn’t be determined - -## [v1.55.0] -- Added first-class Azure Repos support, including CLI commands, enumeration, and documentation updates -- Improved performance of tree-sitter parsing -- Updated Windows build script to ensure static binary is produced - -## [v1.54.0] -- Added first-class Gitea support, including CLI commands, environment-based authentication, documentation, and integration with scans and repository enumeration. -- Populate the finding path from git blob metadata so history-derived secrets display their file location instead of an empty path -- Replaced Match::finding_id’s SHA1-based hashing with a fast xxh3_64 digest that keeps IDs deterministic while eliminating a hot-path SHA1 dependency - -## [v1.53.0] -- Added first-class Bitbucket support, including CLI commands, authentication helpers, documentation, and integration testing. - -## [v1.52.0] -- Enabled ANSI formatting in the tracing formatter whenever stderr is attached to a terminal so colorized updater messages render correctly instead of showing escape sequences. 
-- Added a new CLI flag, `--user-agent-suffix` to allow developers to append additional information to the user-agent -- Removed the unused --rlimit-nofile flag - -## [1.51.0] -- Added diff-only Git scanning via `--since-commit` and `--branch`, including remote-aware ref resolution so CI jobs can pair `--git-url` clones with pull request branches - -## [1.50.0] -- Added `--github-exclude` and `--gitlab-exclude` options to skip specific repositories when scanning or listing GitHub and GitLab sources, including support for gitignore-style glob patterns - -## [1.49.0] -- Enabled MongoDB URI validation -- AWS + GCP validators now respect HTTPS_PROXY and share a consistent user agent across AWS, GCP, and HTTP validation -- Increase max-file-size default to 256 mb (up from 64 mb) -- Improved AWS rule - -## [1.48.0] -- Improved error message when self-update cannot find the current binary -- Optimized memory usage via string interning and extensive data sharing -- Replaced quadratic match filtering with a per-rule span map, fixing missed secrets in extremely large files and improving scan performance -- Support scanning extremely large files by chunking input into 1 GiB segments with small overlaps, avoiding vectorscan buffer limits while preserving match offsets -- Always use chunked vectorscan, eliminating the slow regex fallback for blobs over 4 GiB -- Skip Base64 scanning for blobs over 64 MB to avoid a second pass over massive files -- Increased max-file-size default to 64 MB (up from 25 MB) - -## [1.47.0] -- MongoDB validator now validates `mongodb+srv://` URIs with a fast timeout instead of skipping them -- Improved rules: github oauth2, diffbot, mailchimp, aws -- Added validation to SauceLabs rule -- Added rules: shodan, bitly, flickr -- Decode Base64 blobs and scan their contents for secrets while skipping short strings for performance. 
This has a small performance impact and can be disabled with `--no-base64` - -## [1.46.0] -- Improved rules: AWS, pem -- Added rule for Ollama, Weights and Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, together.ai, zhipu -- Added `self-update` command to update the binary independently. Now supports updating over homebrew managed binary -- MongoDB validator now checks `mongodb+srv://` URIs with fast-fail timeouts - -## [1.45.0] -- Added `--repo-artifacts` flag to scan repository issues, gists/snippets, and wikis when cloning via `--git-url` -- Added rules for sendbird, mattermost, langchain, notion -- JWT validation hardened to reject alg:none by default (only allowed if explicitly configured), require iss for OIDC/JWKS verification, ensuring "Active Credential" means cryptographically verified and time-valid, not just unexpired -- Updated the Git cloning logic to include all refs and minimize clone output, allowing Kingfisher to analyze pull request and deleted branch history - -## [1.44.0] -- Fixed issue with self-update on Linux -- Reverted the change to json and jsonl outputs by rule -- Added `--skip-regex` and `--skip-word` flags to ignore secrets matching custom patterns or skipwords - -## [1.43.0] -- Added rules for clearbit, kickbox, azure container registry, improved Azure Storage key -- Grouped JSON and JSONL outputs by rule, restoring `matches` arrays in reports - -## [1.42.0] -- Fixed pagination issue when calling gitlab api -- Expanded directory exclusion handling to interpret plain patterns as prefixes, ensuring options like --exclude .git also skip all nested paths -- Updated baseline management to track encountered findings and remove entries that are no longer present, saving the baseline file whenever entries are pruned or new matches are added -- Added rules for authress, clickhouse, codecov, contentful, curl, dropbox, fly.io, hubspot, firecrawl -- Internal refactoring of rule loader, git enumerator, and filetype guesser -- Improved 
language detection - -## [1.41.0] -- Added support for scanning gitlab subgroups, with `kingfisher scan --gitlab-group my-group --gitlab-include-subgroups` -- Added rule for Vercel - -## [1.40.0] -- Dropped the “prevalidated” flag from rule definitions and validation logic so every finding now flows through the standard active/inactive/unknown pipeline, simplifying rule configuration and preventing special‑case bypasses -- Improved Tailscale api key detectors - -## [1.39.0] -- Added support for scanning Confluence pages via `--confluence-url` and `--cql` - -## [1.38.0] -- `--quiet` now suppresses scan summaries and rule statistics unless `--rule-stats` is explicitly provided -- Added X Consumer key detection and validation - -## [1.37.0] -- GitLab: Matched GitLab group repository listings to glab by only enumerating projects that belong directly to each group, without automatically traversing subgroups - -## [1.36.0] -- Fixed GitHub organization and GitLab group scans when using `--git-history=none` -- JWT tokens without both `iss` and `aud` are no longer reported as active credentials - -## [1.35.0] -- Remote scans with `--git-history=none` now clone repositories with a working tree and scan the current files instead of erroring with "No inputs to scan". 
-- Fixed issue where `--redact` did not function properly -- Fixed validation logic for clarifai rule - -## [1.34.0] -- Use system TLS root certificates to support self-hosted GitLab instances with internal CAs -- Added new rule: Coze personal access token -- Updated Supabase rule to detect project url's and validate their corresponding tokens - -## [1.33.0] -- Fixed header precedence so custom HTTP validation headers like `Accept` are preserved -- Added new Heroku rule - -## [1.32.0] -- Added support for scanning AWS S3 buckets via `--s3-bucket` and optional `--s3-prefix` -- Added `--role-arn` and `--aws-local-profile` flags for S3 authentication alongside `KF_AWS_KEY`/`KF_AWS_SECRET` -- Added progress bar for scanning s3 buckets -- Refactored output reporting and formatting logic - -## [1.31.0] -- New rules: Telegram bot token, OpenWeatherMap, Apify, Groq -- New OpenAI detectors added (@joshlarsen) -- Fixed bug that broke validation when using unnamed group captures - -## [1.30.0] -- Fixed validation caching for HTTP validators to include rendered headers so inactive secrets no longer appear active. 
-- Removed pre-commit installation hook, due to bugs - -## [1.29.0] -- Fixed issue when more than 1 named capture group is used in a rule variable -- Added a new liquid template filters: `b64dec` -- Added custom validator for Coinbase, and a Coinbase rule that uses it - -## [1.28.0] -- Added support for scanning Slack - -## [1.27.0] -- Added Buildkite rule -- Added support for scanning Docker images via `--docker-image` - -## [1.26.0] -- Added rule for ElevenLabs -- Added support for scanning Jira issues via a given JQL (Jira Query Language) - -## [1.25.0] -- Fixed GitLab authentication bug -- Added pre-commit and pre-receive installation hooks -- MongoDB validator now skips `mongodb+srv://` URIs and returns a message that validation was skipped -- Fixed noisy Baseten rule - -## [1.24.0] -- Now generating DEB and RPM packages -- Now releasing Docker images, and updated README -- Added rule for Scale, Deepgram, AssemblyAI - - -## [1.23.0] -- Updating GitHub Action to generate Docker image -- Added rules for Diffbot, ai21, baseten -- Fixed supabase rule -- Added 'alg' to JWT validation output - -## [1.22.0] -- Added rules for Google Gemini AI, Cohere, Stability.ai, Replicate, Runway, Clarifai -- Upgraded dependencies - -## [1.21.0] -- Improved Azure Storage rule -- Added rule to detect TravisCI encrypted values -- Added baseline feature with `--baseline-file` and `--manage-baseline` flags -- Introduced `--exclude` option for skipping paths -- Added tests covering baseline and exclude workflow -- Added validation for JWT tokens that checks `exp` and `nbf` claims -- JWT validation performs OpenID Connect discovery using the `iss` claim and verifies signatures via JWKS -- Removed `--ignore-tests` argument, because the `--exclude` flag provides more granular functionality -- DigitalOcean rule update -- Adafruit rule update - -## [1.20.0] -- Removed confirmation prompt when user provides --self-update flag -- Added support for HTTP request bodies in rule validation -- 
Added new liquid-rs filters: HmacSha1, IsoTimestampNoFracFilter, Replace -- Added rules for mistral, perplexity -- Added validation for Alibaba rule -- Set GIT_TERMINAL_PROMPT=0 when cloning git repos - -## [1.19.0] -- JSON output was missing committer name and email -- Fixed Gitlab rule which was incorrectly identifying certain tokens as valid - -## [1.18.1] -- Restored --version cli argument -- Added test for the argument - -## [1.18.0] -- Added rules for DeepSeek, xAI -- Removed branding -- Added NOTICE file - -## [1.17.1] -- Fixed broken sourcegraph rule -- Added test to prevent this and similar issues - -## [1.17.0] -- Updated README to give proper attribution to Nosey Parker! -- Added rules for sonarcloud, sonarqube, sourcegraph, shopify, truenas, square, sendgrid, nasa, teamcity, truenas, shopify -- Introduced `--ignore-tests` flag – skip files/dirs whose path resembles tests (`test`, `spec`, `fixture`, `example`, `sample`), reducing noise. -## [1.16.0] -- Fix: HTML detection now requires both HTML content-type and "` flow. - - Kept the direct `kingfisher scan /path/to/dir` flow for local filesystem / local git repo scans while adding a `--list-only` switch to each provider subcommand so repository enumeration no longer requires the standalone `github repos`, `gitlab repos`, etc. commands. - - Removed the legacy top-level provider commands (`kingfisher github`, `kingfisher gitlab`, `kingfisher gitea`, `kingfisher bitbucket`, `kingfisher azure`, `kingfisher huggingface`) now that enumeration lives under `kingfisher scan --list-only`. - - ## [v1.59.0] - - Fixed `kingfisher scan github …` (and other provider-specific subcommands) so they no longer demand placeholder path arguments before the CLI accepts the request. - - Fixed `kingfisher scan` so that providing `--branch` without `--since-commit` now diffs the branch against the empty tree and scans every commit reachable from that branch. 
- - Added rules for meraki, duffel, finnhub, frameio, freshbooks, gitter, infracost, launchdarkly, lob, maxmind, messagebird, nytimes, prefect, scalingo, sendinblue, sentry, shippo, twitch, typeform - - - ## [v1.58.0] - - Added first-class Hugging Face scanning support, including CLI enumeration, token authentication, and integration with remote scans. - - Condensed GitError formatting to report the exit status and the first informative lines from stdout/stderr, producing concise git clone failure logs. - - Added support for scanning Google Cloud Storage buckets via `--gcs-bucket`, including optional prefixes and service-account authentication. diff --git a/README.md.orig b/README.md.orig deleted file mode 100644 index 95a4d68..0000000 --- a/README.md.orig +++ /dev/null @@ -1,1336 +0,0 @@ -# Kingfisher - -

- Kingfisher Logo - -[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
-[![ghcr downloads](https://ghcr-badge.elias.eu.org/shield/mongodb/kingfisher/kingfisher)](https://github.com/mongodb/kingfisher/pkgs/container/kingfisher)
- - -Kingfisher is a blazingly fast secret-scanning and **live validation** tool built in Rust. - -It combines Intel’s SIMD-accelerated regex engine (Hyperscan) with language-aware parsing to achieve high accuracy at massive scale, and **ships with hundreds of built-in rules** to detect, **validate**, and triage secrets before they ever reach production. - -For a look at how Kingfisher has grown from its early foundations into today's full-featured scanner, see [Lineage and Evolution](#lineage-and-evolution). - -

- -## Key Features - -### Multiple Scan Targets -
- -| Files / Dirs | Local Git | GitHub | GitLab | Azure Repos | Bitbucket | Gitea | Hugging Face | -|:-------------:|:----------:|:------:|:------:|:-------------:|:----------:|:------:|:-------------:| -| Files / Dirs
Files / Dirs | Local Git
Local Git | GitHub
GitHub | GitLab
GitLab | Azure Repos
Azure Repos | Bitbucket
Bitbucket | Gitea
Gitea |Hugging Face
Hugging Face | - -| Docker | Jira | Confluence | Slack | AWS S3 | Google Cloud | -|:------:|:----:|:-----------:|:-----:|:------:|:---:| -| Docker
Docker | Jira
Jira | Confluence
Confluence | Slack
Slack | AWS S3
AWS S3 | Google Cloud Storage
Cloud Storage | - -
- -### Performance, Accuracy, and Hundreds of Rules -- **Performance**: multithreaded, Hyperscan‑powered scanning built for huge codebases -- **Extensible rules**: hundreds of built-in detectors plus YAML-defined custom rules ([docs/RULES.md](/docs/RULES.md)) -- **Broad AI SaaS coverage**: finds and validates tokens for OpenAI, Anthropic, Google Gemini, Cohere, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, Together.ai, Zhipu, and many more -- **Compressed Files**: Supports extracting and scanning compressed files for secrets -- **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md)) - -**Learn more:** [Introducing Kingfisher: Real‑Time Secret Detection and Validation](https://www.mongodb.com/blog/post/product-release-announcements/introducing-kingfisher-real-time-secret-detection-validation) - -# Benchmark Results - -See ([docs/COMPARISON.md](docs/COMPARISON.md)) - -

- Kingfisher Runtime Comparison -

- -# Table of Contents - -
- -- [Kingfisher](#kingfisher) - - [Key Features](#key-features) - - [Multiple Scan Targets](#multiple-scan-targets) - - [Performance, Accuracy, and Hundreds of Rules](#performance-accuracy-and-hundreds-of-rules) -- [Benchmark Results](#benchmark-results) -- [Table of Contents](#table-of-contents) -- [Getting Started](#getting-started) - - [Installation](#installation) - - [Pre-built Releases](#pre-built-releases) - - [Homebrew](#homebrew) - - [Linux and macOS](#linux-and-macos) - - [Windows](#windows) - - [Compile](#compile) - - [ Run Kingfisher in Docker](#-run-kingfisher-in-docker) -- [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance) - - [📝 Write Custom Rules!](#-write-custom-rules) - - [Pattern requirements and placeholder filtering](#pattern-requirements-and-placeholder-filtering) -- [🎉 Usage](#-usage) - - [Basic Examples](#basic-examples) - - [Scan with secret validation](#scan-with-secret-validation) - - [Scan a directory containing multiple Git repositories](#scan-a-directory-containing-multiple-git-repositories) - - [Scan a Git repository without validation](#scan-a-git-repository-without-validation) - - [Display only secrets confirmed active by third‑party APIs](#display-only-secrets-confirmed-active-by-thirdparty-apis) - - [Output JSON and capture to a file](#output-json-and-capture-to-a-file) - - [Output SARIF directly to disk](#output-sarif-directly-to-disk) - - [Pipe any text directly into Kingfisher by passing `-`](#pipe-any-text-directly-into-kingfisher-by-passing--) - - [Limit maximum file size scanned (`--max-file-size`)](#limit-maximum-file-size-scanned---max-file-size) - - [Scan using a rule _family_ with one flag](#scan-using-a-rule-family-with-one-flag) - - [Display rule performance statistics](#display-rule-performance-statistics) - - [Scan while ignoring likely test files](#scan-while-ignoring-likely-test-files) - - [Exclude specific paths](#exclude-specific-paths) - - [Scan changes in CI 
pipelines](#scan-changes-in-ci-pipelines) - - [ Scanning an AWS S3 Bucket](#-scanning-an-aws-s3-bucket) - - [ Scanning a Google Cloud Storage Bucket](#-scanning-a-google-cloud-storage-bucket) - - [ Scanning Docker Images](#-scanning-docker-images) - - [ Scanning GitHub](#-scanning-github) - - [Scan GitHub organization (requires `KF_GITHUB_TOKEN`)](#scan-github-organization-requires-kf_github_token) - - [Skip specific GitHub repositories during enumeration](#skip-specific-github-repositories-during-enumeration) - - [Scan remote GitHub repository](#scan-remote-github-repository) - - [ Scanning GitLab](#-scanning-gitlab) - - [Scan GitLab group (requires `KF_GITLAB_TOKEN`)](#scan-gitlab-group-requires-kf_gitlab_token) - - [Scan GitLab user](#scan-gitlab-user) - - [Skip specific GitLab projects during enumeration](#skip-specific-gitlab-projects-during-enumeration) - - [Scan remote GitLab repository by URL](#scan-remote-gitlab-repository-by-url) - - [List GitLab repositories](#list-gitlab-repositories) - - [ Scanning Azure Repos](#-scanning-azure-repos) - - [Scan Azure Repos organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`)](#scan-azure-repos-organization-or-collection-requires-kf_azure_token-or-kf_azure_pat) - - [Scan specific Azure Repos projects](#scan-specific-azure-repos-projects) - - [Skip specific Azure repositories during enumeration](#skip-specific-azure-repositories-during-enumeration) - - [List Azure repositories](#list-azure-repositories) - - [ Scanning Gitea](#-scanning-gitea) - - [Scan Gitea organization (requires `KF_GITEA_TOKEN`)](#scan-gitea-organization-requires-kf_gitea_token) - - [Scan Gitea user](#scan-gitea-user) - - [Skip specific Gitea repositories during enumeration](#skip-specific-gitea-repositories-during-enumeration) - - [Scan remote Gitea repository by URL](#scan-remote-gitea-repository-by-url) - - [List Gitea repositories](#list-gitea-repositories) - - [ Scanning Bitbucket](#-scanning-bitbucket) - - [Scan Bitbucket 
workspace](#scan-bitbucket-workspace) - - [Scan Bitbucket user](#scan-bitbucket-user) - - [Skip specific Bitbucket repositories during enumeration](#skip-specific-bitbucket-repositories-during-enumeration) - - [Scan remote Bitbucket repository by URL](#scan-remote-bitbucket-repository-by-url) - - [List Bitbucket repositories](#list-bitbucket-repositories) - - [Authenticate to Bitbucket](#authenticate-to-bitbucket) - - [Self-hosted Bitbucket Server](#self-hosted-bitbucket-server) - - [ Scanning Hugging Face](#-scanning-hugging-face) - - [Scan Hugging Face user](#scan-hugging-face-user) - - [Scan Hugging Face organization](#scan-hugging-face-organization) - - [Scan specific Hugging Face resources](#scan-specific-hugging-face-resources) - - [List Hugging Face repositories](#list-hugging-face-repositories) - - [Authenticate to Hugging Face](#authenticate-to-hugging-face) - - [ Scanning Jira](#-scanning-jira) - - [Scan Jira issues matching a JQL query](#scan-jira-issues-matching-a-jql-query) - - [Scan the last 1,000 Jira issues:](#scan-the-last-1000-jira-issues) - - [ Scanning Confluence](#-scanning-confluence) - - [Scan Confluence pages matching a CQL query](#scan-confluence-pages-matching-a-cql-query) - - [ Scanning Slack](#-scanning-slack) - - [Scan Slack messages matching a search query](#scan-slack-messages-matching-a-search-query) - - [Environment Variables for Tokens](#environment-variables-for-tokens) - - [Exit Codes](#exit-codes) - - [Update Checks](#update-checks) -- [🤓 Advanced Options](#-advanced-options) - - [Build a Baseline / Detect New Secrets](#build-a-baseline--detect-new-secrets) - - [List Builtin Rules](#list-builtin-rules) - - [To scan using **only** your own `my_rules.yaml` you could run:](#to-scan-using-only-your-own-my_rulesyaml-you-could-run) - - [To add your rules alongside the built‑ins:](#to-add-your-rules-alongside-the-builtins) - - [Other Examples](#other-examples) - - [Customize the HTTP User-Agent](#customize-the-http-user-agent) - - 
[Notable Scan Options](#notable-scan-options) - - [Understanding `--confidence`](#understanding---confidence) - - [Ignore known false positives](#ignore-known-false-positives) - - [Skip Canary Tokens (AWS)](#skip-canary-tokens-aws) - - [Common CLI flows](#common-cli-flows) - - [Inline ignore directives](#inline-ignore-directives) - - [Finding Fingerprint](#finding-fingerprint) - - [Rule Performance Profiling](#rule-performance-profiling) - - [CLI Options](#cli-options) - - [Lineage and Evolution](#lineage-and-evolution) -- [Roadmap](#roadmap) -- [License](#license) - -
- - -# Getting Started -## Installation -### Pre-built Releases -Pre-built binaries are available from the [Releases](https://github.com/mongodb/kingfisher/releases) section. - -### Homebrew -![Homebrew Formula Version](https://img.shields.io/homebrew/v/kingfisher) - -```bash -brew install kingfisher -``` - -### Linux and macOS - -
- -Use the bundled installer script to fetch the latest release and place it in -`~/.local/bin` (or a directory of your choice): - -```bash -# Linux, macOS -curl --silent --location \ - https://raw.githubusercontent.com/mongodb/kingfisher/main/scripts/install-kingfisher.sh | \ - bash -``` - -To install into a custom location, pass the desired directory as an argument: - -```bash -curl --silent --location \ - https://raw.githubusercontent.com/mongodb/kingfisher/main/scripts/install-kingfisher.sh | \ - bash -s -- /opt/kingfisher -``` - -
- -### Windows - -
- -Download and run the PowerShell installer to place the binary in -`$env:USERPROFILE\bin` (or another directory you specify): - -```powershell -# Windows -Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass -Force -Invoke-WebRequest -Uri 'https://raw.githubusercontent.com/mongodb/kingfisher/main/scripts/install-kingfisher.ps1' -OutFile install-kingfisher.ps1 -./install-kingfisher.ps1 -``` - -You can provide a custom destination using the `-InstallDir` parameter: - -```powershell -./install-kingfisher.ps1 -InstallDir 'C:\Tools\Kingfisher' -``` -
- - -### Compile -You may compile for your platform via `make` - -
- -```bash -# NOTE: Requires Docker -make linux - -# macOS --- must build from a macOS host -make darwin - -# Windows x64 --- requires building from a Windows host with Visual Studio installed -./buildwin.bat -force -``` - -```bash -# Build all targets -make linux-all # builds both x64 and arm64 -make darwin-all # builds both x64 and arm64 -make all # builds for every OS and architecture supported -``` - -
- -### Docker Run Kingfisher in Docker - -Run the dockerized Kingfisher container - -
- -```bash -# GitHub Container Registry -docker run --rm ghcr.io/mongodb/kingfisher:latest --version - -# Scan the current working directory -# (mounts your code at /src and scans it) -docker run --rm \ - -v "$PWD":/src \ - ghcr.io/mongodb/kingfisher:latest scan /src - - -# Scan while providing a GitHub token -# Mounts your working dir at /proj and passes in the token: -docker run --rm \ - -e KF_GITHUB_TOKEN=ghp_… \ - -v "$PWD":/proj \ - ghcr.io/mongodb/kingfisher:latest \ - scan --git-url https://github.com/org/private_repo.git - -# Scan an S3 bucket -# Credentials can come from KF_AWS_KEY/KF_AWS_SECRET, --role-arn, or --profile -docker run --rm \ - -e KF_AWS_KEY=AKIA... \ - -e KF_AWS_SECRET=g5nYW... \ - ghcr.io/mongodb/kingfisher:latest \ - scan s3 bucket-name - - -# Scan and write a JSON report locally -# Here we: -# 1. Mount $PWD → /proj -# 2. Tell Kingfisher to write findings.json inside /proj/reports -# 3. Ensure ./reports exists on your host so Docker can mount it -mkdir -p reports - -# run and output into host’s ./reports directory -docker run --rm \ - -v "$PWD":/proj \ - ghcr.io/mongodb/kingfisher:latest \ - scan /proj \ - --format json \ - --output /proj/reports/findings.json - - -# Tip: you can combine multiple mounts if you prefer separating source vs. output: -# Here /src is read‑only, and /out holds your generated reports -docker run --rm \ - -v "$PWD":/src:ro \ - -v "$PWD/reports":/out \ - ghcr.io/mongodb/kingfisher:latest \ - scan /src \ - --format json \ - --output /out/findings.json - -``` - -
- -# 🔐 Detection Rules at a Glance - -Kingfisher ships with [hundreds of rules](/data/rules/) that cover everything from classic cloud keys to the latest AI SaaS tokens. Below is an overview: - -| Category | What we catch | -|----------|---------------| -| **AI SaaS APIs** | OpenAI, Anthropic, Google Gemini, Cohere, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, together.ai, Zhipu, and more | -| **Cloud Providers** | AWS, Azure, GCP, Alibaba Cloud, DigitalOcean, IBM Cloud, Cloudflare, and more | -| **Dev & CI/CD** | GitHub/GitLab tokens, CircleCI, TravisCI, TeamCity, Docker Hub, npm, PyPI, and more | -| **Messaging & Comms** | Slack, Discord, Microsoft Teams, Twilio, Mailgun, SendGrid, Mailchimp, and more | -| **Databases & Data Ops** | MongoDB Atlas, PlanetScale, Postgres DSNs, Grafana Cloud, Datadog, Dynatrace, and more | -| **Payments & Billing** | Stripe, PayPal, Square, GoCardless, and more | -| **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more | -| **Misc. SaaS & Tools** | 1Password, Adobe, Atlassian/Jira, Asana, Netlify, Baremetrics, and more | - -## 📝 Write Custom Rules! - -Kingfisher ships with hundreds of rules with HTTP and service‑specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential. - -However, you may want to add your own custom rules, or modify a detection to better suit your needs / environment. - -First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom Kingfisher rules. - -### Pattern requirements and placeholder filtering - -Every rule can declare optional `pattern_requirements` to enforce additional character checks after a regex matches. Each field -is independent: - -- `min_digits`, `min_uppercase`, `min_lowercase`, and `min_special_chars` enforce complexity thresholds. 
-- `special_chars` lets you override the set of characters counted as "special" when `min_special_chars` is used. -- `ignore_if_contains` lists case-insensitive substrings that should cause a match to be discarded (for example, to drop - `test`, `demo`, or `localhost` values). - -When a match is skipped because of `ignore_if_contains`, Kingfisher logs the event at the `DEBUG` level alongside the rule that -was evaluated. If you need to keep those matches for a particular scan, pass `--no-ignore-if-contains` to `kingfisher scan` to -disable the substring filter without editing any rule files. - -Once you've done that, you can provide your custom rules (defined in a YAML file) and provide it to Kingfisher at runtime --- no recompiling required! - -# 🎉 Usage - -## Basic Examples - -> **Note**  `kingfisher scan` detects whether the input is a Git repository or a plain directory, no extra flags required. - -### Scan with secret validation - -```bash -kingfisher scan /path/to/code -## NOTE: This path can refer to: -# 1. a local git repo -# 2. a directory with many git repos -# 3. or just a folder with files and subdirectories - -## To explicitly prevent scanning git commit history add: -# `--git-history=none` -``` - -### Scan a directory containing multiple Git repositories - -```bash -kingfisher scan /projects/mono‑repo‑dir -``` - -### Scan a Git repository without validation - -```bash -kingfisher scan ~/src/myrepo --no-validate -``` - -### Display only secrets confirmed active by third‑party APIs - -```bash -kingfisher scan /path/to/repo --only-valid -``` - -### Output JSON and capture to a file - -```bash -kingfisher scan . 
--format json | tee kingfisher.json -``` - -### Output SARIF directly to disk - -```bash -kingfisher scan /path/to/repo --format sarif --output findings.sarif -``` - -### Pipe any text directly into Kingfisher by passing `-` - -```bash -cat /path/to/file.py | kingfisher scan - - -``` - -### Limit maximum file size scanned (`--max-file-size`) - -By default, Kingfisher skips files larger than **256 MB**. You can raise or lower this cap per run with `--max-file-size`, which takes a value in **megabytes**. - -```bash -# Scan files up to 500 mb in size -kingfisher scan /some/file --max-file-size 500 -``` - -### Scan using a rule _family_ with one flag - -_(prefix matching: `--rule kingfisher.aws` loads `kingfisher.aws.*`)_ - -```bash -# Only apply AWS-related rules (kingfisher.aws.1 + kingfisher.aws.2) -kingfisher scan /path/to/repo --rule kingfisher.aws -``` - -### Display rule performance statistics - -```bash -kingfisher scan /path/to/repo --rule-stats -``` - -### Scan while ignoring likely test files - -`--exclude` skips any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) - -```bash -# Scan source but skip likely unit / integration tests -kingfisher scan ./my-project \ - --exclude='[Tt]est' \ - --exclude='spec' \ - --exclude='[Ff]ixture' \ - --exclude='example' \ - --exclude='sample' -``` - -### Exclude specific paths -```bash -# Skip all Python files and any directory named tests -kingfisher scan ./my-project \ - --exclude '*.py' \ - --exclude '[Tt]ests' -``` - -### Scan changes in CI pipelines - -Limit scanning to the delta between your default branch and a pull request branch by combining `--since-commit` with `--branch` (defaults to `HEAD`). This only scans files that differ between the two references, which keeps CI runs fast while still blocking new secrets. 
- -Use `--branch-root-commit` alongside `--branch` when you need to include a specific commit (and everything after it) in a diff-focused scan without re-examining earlier history. Provide the branch tip (or other comparison ref) via `--branch`, and pass the commit or merge-base you want to include with `--branch-root-commit`. If you omit `--branch-root-commit`, you can still enable `--branch-root` to fall back to treating the `--branch` ref itself as the inclusive root for backwards compatibility. This is especially useful in long-lived branches where you want to resume scanning from a previous review point or from the commit where a hotfix forked. - -> **How is this different from `--since-commit`?** -> `--since-commit` computes a diff between the branch tip and another ref, so it only inspects files that changed between those two points in history. `--branch-root-commit` rewinds to the parent of the commit you provide and then scans everything introduced from that commit forward, even if the files are unchanged relative to another baseline. Reach for `--since-commit` to keep CI scans fast by checking only the latest delta, and use `--branch-root-commit` when you want to re-audit the full contents of a branch starting at a specific commit. - -```bash -kingfisher scan . 
\ - --since-commit origin/main \ - --branch "$CI_BRANCH" -``` - -Another example: -```bash -cd /tmp -git clone https://github.com/micksmix/SecretsTest.git - -cd /tmp/SecretsTest -git checkout feature-1 -# -# scan diff between main and feature-1 branch -kingfisher scan /tmp/SecretsTest --branch feature-1 \ - --since-commit=$(git -C /tmp/SecretsTest merge-base main feature-1) -# -# scan only a specific commit -kingfisher scan /tmp/SecretsTest \ - --branch baba6ccb453963d3f6136d1ace843e48d7007c3f -# -# scan feature-1 starting at a specific commit (inclusive) -kingfisher scan /tmp/SecretsTest --branch feature-1 \ - --branch-root-commit baba6ccb453963d3f6136d1ace843e48d7007c3f -# -# scan feature-1 starting from the commit where the branch diverged from main -kingfisher scan /tmp/SecretsTest --branch feature-1 \ - --branch-root-commit $(git -C /tmp/SecretsTest merge-base main feature-1) -# -# scan from a hotfix commit that should be re-checked before merging -HOTFIX_COMMIT=$(git -C /tmp/SecretsTest rev-parse hotfix~1) -kingfisher scan /tmp/SecretsTest --branch hotfix \ - --branch-root-commit "$HOTFIX_COMMIT" -``` - -When the branch under test is already checked out, `--branch HEAD` or omitting `--branch` entirely is sufficient. Kingfisher exits with `200` when any findings are discovered and `205` when validated secrets are present, allowing CI jobs to fail automatically if new credentials slip in. - -> **Tip:** You can point Kingfisher at a local working tree and scan another branch or commit without changing checkouts. The CLI now resolves repositories from their worktree roots, so commands like the following work without needing to pass the `.git` directory explicitly: - -```bash -kingfisher scan /path/to/local/repo --branch -kingfisher scan C:\\src\\repo --branch -``` - -The same diff-focused workflow works when cloning repositories on the fly with `--git-url`. 
Kingfisher automatically tries remote-tracking names like `origin/main` and `origin/feature-1`, so you can target the branches involved in a pull request without performing a local checkout first. - -```bash -kingfisher scan \ - --git-url https://github.com/org/repo.git \ - --since-commit main \ - --branch development -``` - -When `--since-commit` is omitted, specifying `--branch` scans the requested ref directly. This makes it easy to analyze a feature branch without checking it out locally. - -```bash -# Scan a branch from an existing checkout -kingfisher scan ~/tmp/repo --branch feature-123 - -# Or scan a branch when cloning on the fly -kingfisher scan \ - --git-url https://github.com/org/repo.git \ - --branch origin/feature-123 -``` - -In CI systems that expose the base and head commits explicitly, you can pass those SHAs directly while still using `--git-url`: - -```bash -kingfisher scan \ - --git-url git@github.com:org/repo.git \ - --since-commit "$BASE_COMMIT" \ - --branch "$PR_HEAD_COMMIT" -``` - -If you want to know which files are being skipped, enable verbose debugging (-v) when scanning, which will report any files being skipped by the baseline file (or via --exclude): - -```bash -# Skip all Python files and any directory named tests, and report to stderr any skipped files -kingfisher scan ./my-project \ - --exclude '*.py' \ - --exclude tests \ - -v -``` - -## GitHub Scanning an AWS S3 Bucket -You can scan S3 objects directly: - -```bash -kingfisher scan s3 bucket-name [--prefix path/] -``` - -Credential resolution happens in this order: - -1. `KF_AWS_KEY` and `KF_AWS_SECRET` environment variables -2. `--profile` pointing to a profile in `~/.aws/config` (works with AWS SSO) -3. anonymous access for public buckets - -If `--role-arn` is supplied, the credentials from steps 1–2 are used to assume that role. - -Examples - -```bash -# using explicit keys -export KF_AWS_KEY=AKIA... -export KF_AWS_SECRET=g5nYW... 
-kingfisher scan s3 some-example-bucket - -# Above can also be run as: -KF_AWS_KEY=AKIA... KF_AWS_SECRET=g5nYW... kingfisher scan s3 some-example-bucket - -# using a local profile (e.g., SSO) that exists in your AWS profile (~/.aws/config) -kingfisher scan s3 some-example-bucket --profile default - -# anonymous scan of a bucket, while providing an object prefix to only scan subset of the s3 bucket -kingfisher scan s3 awsglue-datasets \ - --prefix examples/us-legislators/all - -# assuming a role when scanning -kingfisher scan s3 some-example-bucket \ - --role-arn arn:aws:iam::123456789012:role/MyRole - -# anonymous scan of a public bucket -kingfisher scan s3 some-example-bucket -``` - -Docker example: - -```bash -docker run --rm \ - -e KF_AWS_KEY=AKIA... \ - -e KF_AWS_SECRET=g5nYW... \ - ghcr.io/mongodb/kingfisher:latest \ - scan s3 bucket-name -``` - -## Google Cloud Storage Scanning a Google Cloud Storage Bucket - -Use the `gcs` scan subcommand to stream objects directly from Google Cloud Storage. Authentication -uses Application Default Credentials, so you can provide a service-account JSON file via the -`GOOGLE_APPLICATION_CREDENTIALS` environment variable or by passing `--service-account`. Public -buckets work without credentials. - -```bash -kingfisher scan gcs bucket-name - -# scan a sub-tree inside the bucket -kingfisher scan gcs bucket-name --prefix path/to/data/ - -# supply a service-account key explicitly -kingfisher scan gcs bucket-name --service-account /path/to/key.json -``` - -Functional example: -```bash -kingfisher scan gcs cloud-samples-data --prefix "storage/" -``` - - -## Docker Scanning Docker Images - -Kingfisher will first try to use any locally available image, then fall back to pulling via OCI. - -Authentication happens *in this order*: - -1. **`KF_DOCKER_TOKEN`** env var - - If it contains `user:pass`, it’s used as Basic auth - - Otherwise it’s sent as a Bearer token -2. 
**Docker CLI credentials** - - Checks `credHelpers` (per-registry) and `credsStore` in `~/.docker/config.json`. - - Falls back to the legacy `auths` → `auth` (base64) entries. -3. **Anonymous** (no credentials) - - -```bash -# 1) Scan public or already-pulled image -kingfisher scan docker ghcr.io/owasp/wrongsecrets/wrongsecrets-master:latest-master - -# 2) For private registries, explicitly set KF_DOCKER_TOKEN: -# - Basic auth: "user:pass" -# - Bearer only: "TOKEN" -export KF_DOCKER_TOKEN="AWS:$(aws ecr get-login-password --region us-east-1)" -kingfisher scan docker some-private-registry.dkr.ecr.us-east-1.amazonaws.com/base/amazonlinux2023:latest - -# 3) Or rely on your Docker CLI login/keychain: -# (e.g. aws ecr get-login-password … | docker login …) -kingfisher scan docker private.registry.example.com/my-image:tag -``` - -> **Deprecated** -> Legacy scan flags such as `--github-user`, `--gitlab-group`, -> `--bitbucket-workspace`, `--azure-organization`, `--huggingface-user`, -> `--slack-query`, `--jira-url`, `--confluence-url`, `--s3-bucket`, -> `--gcs-bucket`, and `--docker-image` still work for now, but they trigger a -> warning and will be removed in a future release. Migrate to the -> `kingfisher scan ` subcommands below to future-proof your automations. - -## GitHub Scanning GitHub - -### Scan GitHub organization (requires `KF_GITHUB_TOKEN`) - -```bash -kingfisher scan github --organization my-org -``` - -### Skip specific GitHub repositories during enumeration - -Repeat `--github-exclude` for every repository you want to ignore when scanning -users or organizations. You can provide exact repositories like -`OWNER/REPO` or gitignore-style glob patterns such as `owner/*-archive` -(matching is case-insensitive). - -```bash -kingfisher scan github --organization my-org \ - --github-exclude my-org/huge-repo \ - --github-exclude my-org/*-archive -``` - -### Scan remote GitHub repository - -`--git-url` clones the repository and scans its files and history. 
To also inspect -related server-side data, supply `--repo-artifacts`. This flag pulls down the -repository's issues (including pull requests), wiki, and any public gists owned by -the repository owner and scans them for secrets. Fetching these extras counts -against API rate limits and private artifacts require a `KF_GITHUB_TOKEN`. - -> **Why does `--git-url` sometimes report fewer findings than scanning a local checkout?**. -> -> Remote clones created via `--git-url` default to `--mirror`/bare mode so Kingfisher only -> reads the Git history. When you point Kingfisher at an existing working tree (for example -> `kingfisher scan ./repo`), it enumerates both the filesystem contents *and* the Git -> history. Any secrets that are present in the checked-out files therefore appear twice: -> once from the working tree path and once from the commit where the secret entered the -> history. To replicate the remote behavior locally, either scan a bare clone or disable -> history scanning with `--git-history none` when targeting a working tree. - - -```bash -# Scan the repository only -kingfisher scan --git-url https://github.com/org/repo.git - -# Include issues, wiki, and owner gists -kingfisher scan --git-url https://github.com/org/repo.git --repo-artifacts - -# Private repositories or artifacts -KF_GITHUB_TOKEN="ghp_…" kingfisher scan --git-url https://github.com/org/private_repo.git --repo-artifacts -``` - ---- - -## GitLab Scanning GitLab - -### Scan GitLab group (requires `KF_GITLAB_TOKEN`) - -```bash -kingfisher scan gitlab --group my-group -# include repositories from all nested subgroups -kingfisher scan gitlab --group my-group --include-subgroups -``` - -### Scan GitLab user - -```bash -kingfisher scan gitlab --user johndoe -``` - -### Skip specific GitLab projects during enumeration - -Repeat `--gitlab-exclude` for every project path you want to ignore when scanning -users or groups. 
Specify project paths as `group/project` (case-insensitive) or -use gitignore-style glob patterns like `group/**/archive-*` to drop families of -projects across nested subgroups. - -```bash -kingfisher scan gitlab --group my-group \ - --gitlab-exclude my-group/huge-project \ - --gitlab-exclude my-group/**/archive-* -``` - -### Scan remote GitLab repository by URL - -`--git-url` by itself clones the project repository. To include server-side -artifacts owned by the project, add `--repo-artifacts`. Kingfisher will retrieve -the project's issues, wiki, and snippets and scan them for secrets. These extra -requests may take longer and require a `KF_GITLAB_TOKEN` for private projects. - -```bash -# Scan the repository only -kingfisher scan --git-url https://gitlab.com/group/project.git - -# Include issues, wiki, and snippets -kingfisher scan --git-url https://gitlab.com/group/project.git --repo-artifacts - -# Private projects or artifacts -KF_GITLAB_TOKEN="glpat-…" kingfisher scan --git-url https://gitlab.com/group/private_project.git --repo-artifacts -``` - -### List GitLab repositories - -```bash -kingfisher scan gitlab --group my-group --list-only -# include repositories from all nested subgroups -kingfisher scan gitlab --group my-group --include-subgroups --list-only -# skip specific projects when listing or scanning (supports glob patterns) -kingfisher scan gitlab --group my-group --gitlab-exclude my-group/**/legacy-* --list-only -``` -## Azure Repos Scanning Azure Repos - -### Scan Azure Repos organization or collection (requires `KF_AZURE_TOKEN` or `KF_AZURE_PAT`) - -```bash -kingfisher scan azure --organization my-org - -# Azure Repos Server example -KF_AZURE_PAT="pat" kingfisher scan azure --organization DefaultCollection --azure-base-url https://ado.internal.example/tfs/ -``` - -### Scan specific Azure Repos projects - -Projects are specified as `ORGANIZATION/PROJECT`. Repeat the flag for multiple projects. 
- -```bash -kingfisher scan azure --project my-org/payments \ - --project my-org/core-platform -``` - -### Skip specific Azure repositories during enumeration - -Repeat `--azure-exclude` to ignore repositories when scanning organizations or projects. -Use identifiers like `ORGANIZATION/PROJECT/REPOSITORY`. Repositories that share the same -name as their project can be excluded with `ORGANIZATION/PROJECT`, and gitignore-style -patterns such as `my-org/*/archive-*` are also supported. - -```bash -kingfisher scan azure --organization my-org \ - --azure-exclude my-org/payments/legacy-service \ - --azure-exclude my-org/**/archive-* -``` - -### List Azure repositories - -```bash -kingfisher scan azure --organization my-org --list-only -# list repositories for specific projects -kingfisher scan azure --project my-org/app --project my-org/api --list-only -# skip specific repositories while listing (supports glob patterns) -kingfisher scan azure --organization my-org --azure-exclude my-org/**/experimental-* --list-only -``` -## Gitea Scanning Gitea - -### Scan Gitea organization (requires `KF_GITEA_TOKEN`) - -```bash -kingfisher scan gitea --organization my-org -# self-hosted example -KF_GITEA_TOKEN="gtoken" kingfisher scan gitea --organization platform --gitea-api-url https://gitea.internal.example/api/v1/ -``` - -### Scan Gitea user - -```bash -kingfisher scan gitea --user johndoe -``` - -### Skip specific Gitea repositories during enumeration - -Repeat `--gitea-exclude` for each repository you want to ignore when scanning users -or organizations. Accepts `owner/repo` identifiers or gitignore-style glob patterns -like `team/**/archive-*`. - -```bash -kingfisher scan gitea --organization my-org \ - --gitea-exclude my-org/legacy-repo \ - --gitea-exclude my-org/**/archive-* -``` - -### Scan remote Gitea repository by URL - -`--git-url` clones the repository and scans its history. Adding `--repo-artifacts` -also clones the repository wiki if one exists. 
Private repositories and wikis -require `KF_GITEA_TOKEN` (and `KF_GITEA_USERNAME` when cloning via HTTPS). - -```bash -# Scan the repository only -kingfisher scan --git-url https://gitea.com/org/repo.git - -# Include the repository wiki (if present) -KF_GITEA_TOKEN="gtoken" KF_GITEA_USERNAME="org" \ - kingfisher scan --git-url https://gitea.com/org/repo.git --repo-artifacts -``` - -### List Gitea repositories - -```bash -kingfisher scan gitea --organization my-org --list-only -# enumerate every organization visible to the authenticated user -KF_GITEA_TOKEN="gtoken" kingfisher scan gitea --all-gitea-organizations --list-only -# self-hosted example -KF_GITEA_TOKEN="gtoken" kingfisher scan gitea --user johndoe --gitea-api-url https://gitea.internal.example/api/v1/ --list-only -``` -## Bitbucket Scanning Bitbucket -### Scan Bitbucket workspace - -```bash -kingfisher scan bitbucket --workspace my-team -# include Bitbucket Cloud repositories from every accessible workspace -KF_BITBUCKET_USERNAME="$USER" KF_BITBUCKET_APP_PASSWORD="$APP_PASSWORD" \ - kingfisher scan bitbucket --all-workspaces -``` - -### Scan Bitbucket user - -```bash -kingfisher scan bitbucket --user johndoe -``` - -### Skip specific Bitbucket repositories during enumeration - -Use `--bitbucket-exclude` to ignore repositories while scanning users, workspaces, -or projects. Patterns accept either `owner/repo` (case-insensitive) or -gitignore-style globs such as `workspace/**/archive-*`. - -```bash -kingfisher scan bitbucket --workspace my-team \ - --bitbucket-exclude my-team/legacy-repo \ - --bitbucket-exclude my-team/**/archive-* -``` - -### Scan remote Bitbucket repository by URL - -`--git-url` clones the repository and scans its files and history. To inspect -Bitbucket artifacts such as issues, add `--repo-artifacts`. Private artifacts -require credentials (see [Authenticate to Bitbucket](#authenticate-to-bitbucket)). 
- -```bash -# Scan the repository only -kingfisher scan --git-url https://bitbucket.org/hashashash/secretstest.git - -# Include repository issues -KF_BITBUCKET_USERNAME="user" \ -KF_BITBUCKET_APP_PASSWORD="app-password" \ - kingfisher scan --git-url https://bitbucket.org/workspace/project.git --repo-artifacts -``` - -### List Bitbucket repositories - -```bash -kingfisher scan bitbucket --workspace my-team --list-only -# enumerate all accessible workspaces or projects -KF_BITBUCKET_USERNAME="$USER" KF_BITBUCKET_APP_PASSWORD="$APP_PASSWORD" \ - kingfisher scan bitbucket --all-workspaces --list-only -# filter out repositories using glob patterns -kingfisher scan bitbucket --workspace my-team --bitbucket-exclude my-team/**/experimental-* --list-only -``` - -### Authenticate to Bitbucket - -Kingfisher supports Bitbucket Cloud and Bitbucket Server credentials: - -- **App password or server token** – set `KF_BITBUCKET_USERNAME` and either - `KF_BITBUCKET_APP_PASSWORD`, `KF_BITBUCKET_TOKEN`, or - `KF_BITBUCKET_PASSWORD`. -- **OAuth/PAT token** – set `KF_BITBUCKET_OAUTH_TOKEN`. - -These credentials match the options described in the [ghorg setup -guide](https://github.com/gabrie30/ghorg/blob/master/README.md#bitbucket-setup). - -### Self-hosted Bitbucket Server - -Use `--bitbucket-api-url` to point Kingfisher at your server's REST endpoint, for example -`https://bitbucket.example.com/rest/api/1.0/`. Provide credentials with -`KF_BITBUCKET_USERNAME` plus either `KF_BITBUCKET_TOKEN` or `KF_BITBUCKET_PASSWORD`, -and pass `--ignore-certs` when connecting to HTTP or otherwise insecure instances. -## Hugging Face Scanning Hugging Face - -Hugging Face hosts git repositories for models, datasets, and Spaces. Kingfisher can enumerate and scan all three resource types. 
- -### Scan Hugging Face user - -```bash -kingfisher scan huggingface --user <username> -``` - -### Scan Hugging Face organization - -```bash -kingfisher scan huggingface --organization <organization> -``` - -### Scan specific Hugging Face resources - -Scan individual repositories by ID (owner/name) or by passing the full HTTPS URL: - -```bash -kingfisher scan huggingface --model <owner>/<model> -kingfisher scan huggingface --dataset https://huggingface.co/datasets/<owner>/<dataset> -kingfisher scan huggingface --space <owner>/<space> -``` - -Use `--huggingface-exclude` to omit results returned by user or organization enumeration. Prefix values with `model:`, `dataset:`, or `space:` when you only want to skip a specific resource type. - -### List Hugging Face repositories - -```bash -kingfisher scan huggingface --user <username> --list-only -``` - -### Authenticate to Hugging Face - -Private repositories require an access token provided through the `KF_HUGGINGFACE_TOKEN` environment variable. For git authentication the helper also honours `KF_HUGGINGFACE_USERNAME` (default `hf_user`). - -## Jira Scanning Jira - -### Scan Jira issues matching a JQL query - -```bash -KF_JIRA_TOKEN="token" kingfisher scan jira --url https://jira.company.com \ - --jql "project = TEST AND status = Open" \ - --max-results 500 -``` - -### Scan the last 1,000 Jira issues: -```bash -KF_JIRA_TOKEN="token" kingfisher scan jira --url https://jira.mongodb.org \ - --jql 'ORDER BY created DESC' \ - --max-results 1000 -``` - -## Confluence Scanning Confluence -### Scan Confluence pages matching a CQL query - -```bash -# Bearer token -KF_CONFLUENCE_TOKEN="token" kingfisher scan confluence --url https://confluence.company.com \ - --cql "label = secret" \ - --max-results 500 - -# Basic auth with username and token -KF_CONFLUENCE_USER="user@example.com" KF_CONFLUENCE_TOKEN="token" \ - kingfisher scan confluence --url https://confluence.company.com \ - --cql "text ~ 'password'" \ - --max-results 500 -``` - -Use the base URL of your Confluence site for `--confluence-url`.
Kingfisher -automatically adds `/rest/api` to the end, so `https://example.com/wiki` and -`https://example.com` both work depending on your server configuration. - -Generate a personal access token and set it in the `KF_CONFLUENCE_TOKEN` environment variable. By default, Kingfisher sends the token as a bearer token in the `Authorization` header. - -To use basic authentication instead, also set `KF_CONFLUENCE_USER` to your Confluence email address; Kingfisher will then send the username and `KF_CONFLUENCE_TOKEN` as a Basic auth header. If the server responds with a redirect to a login page, the credentials are invalid or lack the required permissions. - -## Slack Scanning Slack -### Scan Slack messages matching a search query - -```bash -KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan slack "from:username has:link" \ - --max-results 1000 - -KF_SLACK_TOKEN="xoxp-1234..." kingfisher scan slack "akia" \ - --max-results 1000 -``` -*The Slack token must be a user token with the `search:read` scope. 
Bot tokens (those beginning with `xoxb-`) cannot call the Slack search API.* - -## Environment Variables for Tokens - -| Variable | Purpose | -| ----------------- | ---------------------------- | -| `KF_GITHUB_TOKEN` | GitHub Personal Access Token | -| `KF_GITLAB_TOKEN` | GitLab Personal Access Token | -| `KF_GITEA_TOKEN` | Gitea Personal Access Token | -| `KF_GITEA_USERNAME` | Username for private Gitea clones (used with `KF_GITEA_TOKEN`) | -| `KF_AZURE_TOKEN` / `KF_AZURE_PAT` | Azure Repos Personal Access Token | -| `KF_AZURE_USERNAME` | Username to use with Azure Repos PATs (defaults to `pat` when unset) | -| `KF_BITBUCKET_USERNAME` | Bitbucket username for basic authentication | -| `KF_BITBUCKET_APP_PASSWORD` / `KF_BITBUCKET_TOKEN` | Bitbucket app password or server token | -| `KF_BITBUCKET_OAUTH_TOKEN` | Bitbucket OAuth or PAT token | -| `KF_HUGGINGFACE_TOKEN` | Hugging Face access token for API enumeration and git cloning | -| `KF_HUGGINGFACE_USERNAME` | Optional username for Hugging Face git operations (defaults to `hf_user`) | -| `KF_JIRA_TOKEN` | Jira API token | -| `KF_CONFLUENCE_TOKEN` | Confluence API token | -| `KF_SLACK_TOKEN` | Slack API token | -| `KF_DOCKER_TOKEN` | Docker registry token (`user:pass` or bearer token). 
If unset, credentials from the Docker keychain are used | -| `KF_AWS_KEY` and `KF_AWS_SECRET` | AWS Credentials to use with S3 bucket scanning | - -Set them temporarily per command: - -```bash -KF_GITLAB_TOKEN="glpat-…" kingfisher scan gitlab --group my-group -``` - -Or export for the session: - -```bash -export KF_GITLAB_TOKEN="glpat-…" -``` - -To authenticate Jira requests: -```bash -export KF_JIRA_TOKEN="token" -``` - -To authenticate Confluence requests: -```bash -export KF_CONFLUENCE_TOKEN="token" -``` - -_If no token is provided Kingfisher still works for public repositories._ - ---- - -## Exit Codes - -| Code | Meaning | -| ---- | ----------------------------- | -| 0 | No findings | -| 200 | Findings discovered | -| 205 | Validated findings discovered | - -## Update Checks - -Kingfisher automatically queries GitHub for a newer release when it starts and tells you whether an update is available. - -- **Hands-free updates** – Add `--self-update` to any Kingfisher command - - * If a newer version exists, Kingfisher will download it, replace the running binary, and re-launch itself with the **exact same arguments**. - * If the update fails or no newer release is found, the current run proceeds as normal - -- **Manual update** – Run `kingfisher self-update` to update the binary without scanning - -- **Disable version checks** – Pass `--no-update-check` to skip both the startup and shutdown checks entirely - -# 🤓 Advanced Options - -## Build a Baseline / Detect New Secrets - -There are situations where a repository already contains checked‑in secrets, but you want to ensure no **new** secrets are introduced. A baseline file lets you document the known findings so future scans only report anything that is not already in that list. 
- -The easiest way to create a baseline is to run a normal scan with the `--manage-baseline` flag (typically at a low confidence level to capture all potential matches): - -```bash -kingfisher scan /path/to/code \ - --confidence low \ - --manage-baseline \ - --baseline-file ./baseline-file.yml -``` - -`--manage-baseline` automatically enables `--no-dedup` so the baseline captures every individual occurrence. - -Use the same YAML file with the `--baseline-file` option on future scans to hide all recorded findings: - -```bash -kingfisher scan /path/to/code \ - --baseline-file /path/to/baseline-file.yaml -``` - -Running the scan again with `--manage-baseline` refreshes the baseline by adding new findings and pruning entries for secrets that no longer appear. See [docs/BASELINE.md](docs/BASELINE.md) for full detail. - -## List Builtin Rules - -```bash -kingfisher rules list -``` - -## To scan using **only** your own `my_rules.yaml` you could run: - -```bash -kingfisher scan \ - --load-builtins=false \ - --rules-path path/to/my_rules.yaml \ - ./src/ -``` - -## To add your rules alongside the built‑ins: - -```bash -kingfisher scan \ - --rules-path ./custom-rules/ \ - --rules-path my_rules.yml \ - ~/path/to/project-dir/ -``` - -## Other Examples - -```bash -# Check custom rules - this ensures all regular expressions compile, and can match the rule's `examples` in the YML file -kingfisher rules check --rules-path ./my_rules.yml - -# List GitHub repos -kingfisher scan github --user my-user --list-only -kingfisher scan github --organization my-org --list-only -# Skip specific repositories when listing or scanning (supports glob patterns) -kingfisher scan github --organization my-org --github-exclude my-org/*-archive --list-only - -``` - -## Customize the HTTP User-Agent - -Kingfisher identifies its HTTP requests with a user-agent that includes the binary name and version followed by a browser-style -string. 
Some environments require extra context, such as a contact address, a change-ticket number, or a temporary test label. -Use the global `--user-agent-suffix` flag to append this information between the Kingfisher identifier and the browser portion: - -```bash -# Attach a contact email to all outbound validation requests -kingfisher --user-agent-suffix "contact=security@example.com" scan path/ - -# Label a one-off experiment -kingfisher --user-agent-suffix "Sept 2025 testing" scan github --user my-user --list-only -``` - -When omitted, Kingfisher defaults to `kingfisher/<version> Mozilla/5.0 ...`. The suffix is trimmed; passing an empty string -leaves the default unchanged. - -## Notable Scan Options - -- `--no-dedup`: Report every occurrence of a finding (disable the default de-duplicate behavior) -- `--no-base64`: By default, Kingfisher finds and decodes base64 blobs and scans them for secrets. This adds a slight performance overhead; use this flag to disable -- `--confidence <LEVEL>`: (low|medium|high) -- `--min-entropy <VALUE>`: Override default threshold -- `--no-binary`: Skip binary files -- `--no-extract-archives`: Do not scan inside archives -- `--extraction-depth <N>`: Specifies how deep nested archives should be extracted and scanned (default: 2) -- `--redact`: Replaces discovered secrets with a one-way hash for secure output -- `--exclude <PATTERN>`: Skip any file or directory whose path matches this glob pattern (repeatable, uses gitignore-style syntax, case sensitive) -- `--baseline-file <FILE>`: Ignore matches listed in a baseline YAML file -- `--manage-baseline`: Create or update the baseline file with current findings (automatically enables `--no-dedup`) -- `--skip-regex <PATTERN>`: Ignore findings whose text matches this regex (repeatable) -- `--skip-word <WORD>`: Ignore findings containing this case-insensitive word (repeatable) -- `--skip-aws-account <ACCOUNT_ID>`: Skip live AWS validation for findings tied to the specified AWS account number (repeatable, accepts comma-separated lists) -- `--skip-aws-account-file <FILE>
`: Load AWS account numbers to skip from a file (one account per line; `#` comments allowed) -- `--ignore-comment <DIRECTIVE>`: Honor additional inline directives from other scanners (repeatable; e.g. `--ignore-comment "gitleaks:allow"`) -- `--no-ignore`: Disable inline directives entirely so every match is reported -- `--no-ignore-if-contains`: Ignore the `ignore_if_contains` filter in rules so placeholder words still produce findings - -## Understanding `--confidence` - -The `--confidence` flag sets a minimum confidence threshold, not an exact match. - -- If you pass `--confidence medium`, findings with **medium and higher** confidence (medium + high) will be included. -- If you pass `--confidence low`, you’ll see **all levels** (low, medium, high). - - -### Ignore known false positives - -Use `--skip-regex` and `--skip-word` to suppress findings you know are benign. Both flags may be provided multiple times and are tested against the secret value **and** the full match context. - -Patterns passed to `--skip-regex` must be Rust-compatible regular expressions, which you can test at [regex101](https://regex101.com). - -```bash -# Skip any finding where the finding mentions TEST_KEY -kingfisher scan --skip-regex '(?i)TEST_KEY' path/ - -# Skip findings that contain the word "dummy" anywhere in the match -kingfisher scan --skip-word dummy path/ - -# Combine multiple patterns -kingfisher scan \ - --skip-regex 'AKIA[0-9A-Z]{16}' \ - --skip-word placeholder \ - --skip-word dummy \ - path/ -``` - -If a `--skip-regex` regular expression fails to compile, the scan aborts with an error so that typos are caught early. - -### Skip Canary Tokens (AWS) - -Canary/honey tokens are intentionally leaked credentials used to catch misuse. Kingfisher can **recognize and skip** known AWS canary accounts so hygiene scans don’t set off alerts.
- -**How to skip** -Pass the 12-digit AWS account IDs for your canaries via `--skip-aws-account` (comma-separated) or `--skip-aws-account-file` (one ID per line; blank lines and `#` comments allowed). Kingfisher also ships with a **pre-seeded (but not exhaustive)** list of Thinkst Canary account IDs used by canarytokens.org, so many are skipped automatically. - -```bash -kingfisher scan /path/to/code \ - --skip-aws-account "171436882533,534261010715" - -# or combine preloaded canary IDs with a just-created decoy account -printf '999900001111 \n534261010715' > /tmp/canary_accounts.txt - -kingfisher scan /path/to/repo \ - --skip-aws-account-file /tmp/canary_accounts.txt - -``` - -**What you’ll see** -Findings tied to a skip-listed account report `Validation: Not Attempted` and note in the `Response:` that the entry came from the skip list: - -```bash -AWS SECRET ACCESS KEY => [KINGFISHER.AWS.2] - |Finding.......: - |Fingerprint...: 2141074333616819500 - |Confidence....: medium - |Entropy.......: 5.00 - |Validation....: Not Attempted - |__Response....: (skip list entry) AWS validation not attempted for account 171436882533. - |Language......: Unknown - |Line Num......: 21 - |Path..........: /tmp/test_canary_accounts.log -``` - -**Why this matters** -Skipping prevents noisy tripwires in prod telemetry while keeping the status explicit—“Not Attempted” isn’t a pass. If needed, verify these credentials out-of-band or with a safe, non-triggering method. 
- - -#### Common CLI flows - -```bash -# Skip a few in-house canaries during a filesystem scan -kingfisher scan repo/ \ - --skip-aws-account "111122223333,444455556666" - -# Read a longer list from disk -kingfisher scan repo/ \ - --skip-aws-account-file /tmp/scripts/canary_accounts.txt - -# Combine preloaded canary IDs with a just-created decoy account -printf '999900001111\n534261010715\n' > /tmp/new_canary.txt - -kingfisher scan /path/to/repo \ - --skip-aws-account-file /tmp/new_canary.txt - -``` - -Tip: if you manage multiple canary fleets (Thinkst, self-hosted alternatives, or bespoke decoys), checkpoint the account IDs alongside your infrastructure-as-code so security teams can rotate or expand the skip list without editing pipelines. - -### Inline ignore directives - -Add `kingfisher:ignore` anywhere on the same line as a finding to silence it. Multi-line strings and PEM-style blocks may also be ignored by placing the directive on the closing delimiter line (for example, `""" # kingfisher:ignore`), on the next logical line after the string, **or** on a comment immediately before the value: - -```python -# kingfisher:ignore -API_KEY = """ -line 1 -line 2 -""" -# kingfisher:ignore -``` - -Kingfisher searches the surrounding lines for these tokens without requiring language-specific comment markers. To reuse existing inline directives from other scanners, add them with repeatable `--ignore-comment` flags (for example `--ignore-comment "gitleaks:allow" --ignore-comment "NOSONAR"`). Use `--no-ignore` when you want to disable inline suppressions entirely. - - -## Finding Fingerprint - -The document below details the four-field formula (rule SHA-1, origin label, start & end offsets) hashed with XXH3-64 to create Kingfisher’s 64-bit finding fingerprint, explains how this ID powers safe deduplication, and describes how `--no-dedup` can be used to show every raw match.
-See ([docs/FINGERPRINT.md](docs/FINGERPRINT.md)) - -## Rule Performance Profiling - -Use `--rule-stats` to collect timing information for every rule. After scanning, the summary prints a **Rule Performance Stats** section showing how many matches each rule produced along with its slowest and average match times. Useful when creating rules or debugging rules. - -## CLI Options - -```bash -kingfisher scan --help -``` - - -## Lineage and Evolution - -Kingfisher began as an internal fork of Nosey Parker, used as a high-performance foundation for secret detection. - -Since then it has evolved far beyond that starting point, introducing live validation, hundreds of new rules, additional scan targets, and major architectural changes across nearly every subsystem. - -**Key areas of evolution** -- **Live validation** of detected secrets directly within rules -- **Hundreds of new built-in rules** and an expanded YAML rule schema -- **Baseline management** to suppress known findings over time -- **Tree-sitter parsing** layered on Hyperscan for language-aware detection -- **More scan targets** (GitLab, Bitbucket, Gitea, Jira, Confluence, Slack, S3, GCS, Docker, Hugging Face, etc.) 
-- **Compressed Files** scanning support added -- **New storage model** (in-memory + Bloom filter, replacing SQLite) -- **Unified workflow** with JSON/BSON/SARIF outputs -- **Cross-platform builds** for Linux, macOS, and Windows - -# Roadmap - -- More rules -- More targets -- Please file a [feature request](https://github.com/mongodb/kingfisher/issues), or open a PR, if you have features you'd like added - -# License - -[Apache2 License](LICENSE) diff --git a/README.md.rej b/README.md.rej deleted file mode 100644 index af82147..0000000 --- a/README.md.rej +++ /dev/null @@ -1,68 +0,0 @@ -@@ -311,54 +311,63 @@ - | **Dev & CI/CD** | GitHub/GitLab tokens, CircleCI, TravisCI, TeamCity, Docker Hub, npm, PyPI, and more | - | **Messaging & Comms** | Slack, Discord, Microsoft Teams, Twilio, Mailgun, SendGrid, Mailchimp, and more | - | **Databases & Data Ops** | MongoDB Atlas, PlanetScale, Postgres DSNs, Grafana Cloud, Datadog, Dynatrace, and more | - | **Payments & Billing** | Stripe, PayPal, Square, GoCardless, and more | - | **Security & DevSecOps** | Snyk, Dependency-Track, CodeClimate, Codacy, OpsGenie, PagerDuty, and more | - | **Misc. SaaS & Tools** | 1Password, Adobe, Atlassian/Jira, Asana, Netlify, Baremetrics, and more | - - ## 📝 Write Custom Rules! - - Kingfisher ships with hundreds of rules with HTTP and service‑specific validation checks (AWS, Azure, GCP, etc.) to confirm if a detected string is a live credential. - - However, you may want to add your own custom rules, or modify a detection to better suit your needs / environment. - - First, review [docs/RULES.md](/docs/RULES.md) to learn how to create custom Kingfisher rules. - - ### Pattern requirements and placeholder filtering - - Every rule can declare optional `pattern_requirements` to enforce additional character checks after a regex matches. Each field - is independent: - - - `min_digits`, `min_uppercase`, `min_lowercase`, and `min_special_chars` enforce complexity thresholds. 
- - `special_chars` lets you override the set of characters counted as "special" when `min_special_chars` is used. - - `ignore_if_contains` lists case-insensitive substrings that should cause a match to be discarded (for example, to drop - `test`, `demo`, or `localhost` values). Kingfisher still accepts the legacy `exclude_words` key as an alias when loading - existing rule files. -- --When a match is skipped because of `ignore_if_contains`, Kingfisher logs the event at the `DEBUG` level alongside the rule that --was evaluated. If you need to keep those matches for a particular scan, pass `--no-ignore-if-contains` to `kingfisher scan` to --disable the substring filter without editing any rule files. -+- `checksum` lets you compare an extracted portion of the match against a Liquid-rendered expectation. Provide `actual.template` -+ and `expected` Liquid snippets (with access to `{{ MATCH }}`, `{{ FULL_MATCH }}`, and any named capture as both its original -+ case and uppercase alias) and Kingfisher will skip the finding when the rendered values differ. Optional keys such as -+ `requires_capture` and `skip_if_missing` help you guard against legacy formats while onboarding the checksum-aware variant. -+ -+When a match is skipped because of `ignore_if_contains` or a checksum mismatch, Kingfisher logs the event at the `DEBUG` level -+alongside the rule that was evaluated. If you need to keep those matches for a particular scan, pass `--no-ignore-if-contains` -+to `kingfisher scan` to disable the substring filter without editing any rule files. Verbose mode (`-v`) will also show you the -+checksum mismatch lengths so you can confirm why a finding was suppressed. -+ -+To support checksum workflows, Kingfisher now includes Liquid helpers such as `suffix` (to slice characters from a match), -+`crc32` (to hash the body), and `base62` (to encode and pad the checksum). You can mix these filters with your own templates to -+mirror provider-specific checksum implementations. 
- - Once you've done that, you can provide your custom rules (defined in a YAML file) and provide it to Kingfisher at runtime --- no recompiling required! - - # 🎉 Usage - - ## Basic Examples - - > **Note**  `kingfisher scan` detects whether the input is a Git repository or a plain directory, no extra flags required. - - ### Scan with secret validation - - ```bash - kingfisher scan /path/to/code - ## NOTE: This path can refer to: - # 1. a local git repo - # 2. a directory with many git repos - # 3. or just a folder with files and subdirectories - - ## To explicitly prevent scanning git commit history add: - # `--git-history=none` - ``` - - ### Scan a directory containing multiple Git repositories - - ```bash From 76a74226664e337e753408437653543934c83b79 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Fri, 7 Nov 2025 18:49:03 -0800 Subject: [PATCH 13/19] Added checksum comparisons to pattern_requirements, new suffix, crc32, and base62 Liquid filters, and verbose logging so mismatched checksums are skipped with context rather than reported as findings. 
--- data/rules/npm.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/data/rules/npm.yml b/data/rules/npm.yml index e275e14..ae2b46d 100644 --- a/data/rules/npm.yml +++ b/data/rules/npm.yml @@ -5,11 +5,17 @@ rules: (?xi) \b ( - npm_[A-Z0-9]{36} + npm_(?P<body>[A-Za-z0-9]{30})(?P<checksum>[A-Za-z0-9]{6}) ) \b pattern_requirements: min_digits: 2 + checksum: + actual: + template: "{{ MATCH | suffix: 6 }}" + requires_capture: checksum + expected: "{{ BODY | crc32 | base62: 6 }}" + skip_if_missing: true references: - https://docs.npmjs.com/about-access-tokens - https://github.com/github/roadmap/issues/557 From 065b26a220bd16807355e1dbaa2dcf0ce06856a6 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sat, 8 Nov 2025 10:48:00 -0800 Subject: [PATCH 14/19] fixing rules --- data/rules/aws.yml | 15 +++++++-------- data/rules/azurestorage.yml | 4 ++-- data/rules/confluent.yml | 5 ++--- data/rules/github.yml | 11 ++++++----- data/rules/gitlab.yml | 3 +-- data/rules/ibm.yml | 2 +- data/rules/mapbox.yml | 12 +++++++++--- data/rules/npm.yml | 2 +- data/rules/opsgenie.yml | 16 +++++++++++----- data/rules/pypi.yml | 3 +-- data/rules/recaptcha.yml | 1 - data/rules/sentry.yml | 15 +++++++-------- data/rules/telegram.yml | 2 +- src/decompress.rs | 4 ++-- src/validation.rs | 15 ++++++++++++--- tests/int_base64.rs | 14 +++++++------- tests/int_slack.rs | 2 +- tests/smoke_archive.rs | 2 +- tests/smoke_baseline.rs | 2 +- tests/smoke_exclude.rs | 2 +- tests/smoke_fs.rs | 2 +- tests/smoke_git.rs | 6 +++--- 22 files changed, 78 insertions(+), 62 deletions(-) diff --git a/data/rules/aws.yml b/data/rules/aws.yml index dda39b2..6cd938a 100644 --- a/data/rules/aws.yml +++ b/data/rules/aws.yml @@ -22,23 +22,22 @@ rules: pattern: | (?xi) (?: - (?:[^A-Za-z0-9/+=]) + \b (?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA) (?:.|[\n\r]){0,64}?
- (?:[^A-Za-z0-9/+=]) - ([A-Za-z0-9+]{40}) - (?:[^A-Za-z0-9/+=]) + \b + ([A-Za-z0-9/+]{40}) + \b | - (?:[^A-Za-z0-9/+=]) - (?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA) + \b(?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA) (?:.|[\n\r]){0,96}? (?:SECRET|PRIVATE|ACCESS) (?:.|[\n\r]){0,16}? (?:KEY|TOKEN) (?:.|[\n\r]){0,64}? \b - ([A-Za-z0-9+]{40}) - (?:[^A-Za-z0-9/+=]) + ([A-Za-z0-9/+]{40}) + \b ) pattern_requirements: min_digits: 2 diff --git a/data/rules/azurestorage.yml b/data/rules/azurestorage.yml index f577194..1fe3c44 100644 --- a/data/rules/azurestorage.yml +++ b/data/rules/azurestorage.yml @@ -28,7 +28,7 @@ rules: id: kingfisher.azurestorage.2 pattern: | (?xi) - (?:[^A-Za-z0-9/+=]) + \b azure (?:.|[\n\r]){0,128}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) @@ -51,7 +51,7 @@ rules: confidence: medium examples: - Azure AccountKey=Xy9aB8cD7eF6gH5iJ4kL3mN2oP1qR0sT9uV8wX7yZ6aB5cD4eF3gH2iJ1kL0mN9oP8qR7sT6uV5wX4yZ3aB2cD1q - - Azure AccountKey=Xy9aB8cD7eF6gH5iJ4kL3mN2oP1qR0sT9uV8wX7yZ6aB5cD4eF3gH2iJ1kL0mN9oP8qR7sT6uV5wX4yZ3aB2cD1g==\ + - Azure AccountKey=Ky7aC1cD7eF6gH5iJ4kL3mN2oP1qR0sT9uV8wX7yZ6aB5cD4eF3gH2iJ1kL0mN9oP8qR7sT6uV5wX4yZ3aB2cD1g==\ validation: type: AzureStorage depends_on_rule: diff --git a/data/rules/confluent.yml b/data/rules/confluent.yml index a02b051..ce8b003 100644 --- a/data/rules/confluent.yml +++ b/data/rules/confluent.yml @@ -32,12 +32,11 @@ rules: ( [A-Z0-9\+/]{64} ) - (?:[^A-Za-z0-9/+=]) min_entropy: 3.3 confidence: medium examples: - - confluent secret=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890ab - - kafka_token=ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefghijklmnopqrstuvwxyzAB + - confluent secret=cbadefghijklmnopqrstuvwxyzcbaDEFGHIJKLMNOPQRSTUVWXYZ3214567890ab + - kafka_token=cbaDEFGHIJKLMNOPQRSTUVWXYZ3214567890cbadefghijklmnopqrstuvwxyzAB references: - https://docs.confluent.io/cloud/current/api.html#tag/API-Keys-(iamv2)/operation/getIamV2ApiKey validation: diff --git 
a/data/rules/github.yml b/data/rules/github.yml index bf86cac..334c7ed 100644 --- a/data/rules/github.yml +++ b/data/rules/github.yml @@ -4,15 +4,16 @@ rules: pattern: | (?xi) ( - github_pat - [A-Z0-9]{80,84} + github_pat_ + [A-Z0-9_+]{82,84} ) + \b pattern_requirements: min_digits: 2 min_lowercase: 2 min_entropy: 3.5 examples: - - "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" + - "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" references: - https://docs.github.com/en/rest/users?apiVersion=2022-11-28 validation: @@ -247,11 +248,11 @@ rules: (?:.|[\n\r]){0,32}? (?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) (?:.|[\n\r]){0,32}? - (?:[^A-Za-z0-9/+=]) + \b ( [a-z0-9]{40} ) - (?:[^A-Za-z0-9/+=]) + \b depends_on_rule: - rule_id: "kingfisher.github.5" variable: GITHUB_CLIENT_ID diff --git a/data/rules/gitlab.yml b/data/rules/gitlab.yml index 1cb6bd2..96a98bc 100644 --- a/data/rules/gitlab.yml +++ b/data/rules/gitlab.yml @@ -86,11 +86,10 @@ rules: id: kingfisher.gitlab.3 pattern: | (?xi) - (?:[^A-Za-z0-9/+=]|\r\n|\\n) + \b ( glptt-[0-9a-f]{40} ) - \b pattern_requirements: min_digits: 2 examples: diff --git a/data/rules/ibm.yml b/data/rules/ibm.yml index 284e5fc..390d927 100644 --- a/data/rules/ibm.yml +++ b/data/rules/ibm.yml @@ -3,7 +3,7 @@ rules: id: kingfisher.ibm.1 pattern: | (?xi) - (?:[^A-Za-z0-9/+=]) + \b (?:ibm(?:cloud)?|bx) (?:.|[\n\r]){0,32}? 
(?:SECRET|PRIVATE|ACCESS|KEY|TOKEN) diff --git a/data/rules/mapbox.yml b/data/rules/mapbox.yml index 26ab432..313eb9e 100644 --- a/data/rules/mapbox.yml +++ b/data/rules/mapbox.yml @@ -1,7 +1,7 @@ rules: - name: Mapbox Public Access Token id: kingfisher.mapbox.1 - pattern: '(?i)(?s)mapbox.{0,30}(pk\.[a-z0-9\-+/=]{32,128}\.[a-z0-9\-+/=]{20,30})(?:[^a-z0-9\-+/=])' + pattern: '(?i)(?s)mapbox.{0,30}(pk\.[a-z0-9\-+/=]{32,128}\.[a-z0-9\-+/=]{20,30})\b' pattern_requirements: min_digits: 2 min_entropy: 3.3 @@ -28,7 +28,13 @@ rules: - name: Mapbox Secret Access Token id: kingfisher.mapbox.2 - pattern: '(?i)(?s)mapbox.{0,30}(sk\.[a-z0-9\-+/=]{32,128}\.[a-z0-9\-+/=]{20,30})(?:[^a-z0-9\-+/=])' + pattern: | + (?xi)(?s) + mapbox.{0,30} + ( + sk\.[a-z0-9\-+/=]{32,128}\.[a-z0-9\-+/=]{20,30} + ) + \b pattern_requirements: min_digits: 2 min_entropy: 3.3 @@ -54,7 +60,7 @@ rules: - name: Mapbox Temporary Access Token id: kingfisher.mapbox.3 - pattern: '(?i)(?s)mapbox.{0,30}(tk\.[a-z0-9\-+/=]{32,128}\.[a-z0-9\-+/=]{20,30})(?:[^a-z0-9\-+/=])' + pattern: '(?i)(?s)mapbox.{0,30}(tk\.[a-z0-9\-+/=]{32,128}\.[a-z0-9\-+/=]{20,30})\b' pattern_requirements: min_digits: 2 min_entropy: 3.3 diff --git a/data/rules/npm.yml b/data/rules/npm.yml index ae2b46d..6132d80 100644 --- a/data/rules/npm.yml +++ b/data/rules/npm.yml @@ -23,7 +23,7 @@ rules: min_entropy: 3.3 confidence: medium examples: - - 'npm_TCllNwh2WLQlMWVhybM1iQrsTj6rMQ0BOh6d' + - "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" validation: type: Http content: diff --git a/data/rules/opsgenie.yml b/data/rules/opsgenie.yml index 0e1c620..4369d2f 100644 --- a/data/rules/opsgenie.yml +++ b/data/rules/opsgenie.yml @@ -18,17 +18,23 @@ rules: min_entropy: 3.5 examples: - opsgenie_api_key = '12345678-9abc-def0-1234-56789abcdef0' + references: + - https://docs.opsgenie.com/docs/api-overview + - https://support.atlassian.com/security-and-access-policies/docs/send-alerts-to-opsgenie/ + - 
https://support.atlassian.com/opsgenie/docs/european-service-region/ validation: type: Http content: request: - headers: - Authorization: GenieKey {{ TOKEN }} method: GET url: https://api.opsgenie.com/v2/alerts + headers: + Authorization: "GenieKey {{ TOKEN }}" response_matcher: - report_response: true - - type: WordMatch - words: - - "Could not authenticate" + - type: StatusMatch + status: [401, 403] negative: true + - type: WordMatch + words: ["Could not authenticate", "is not valid"] + negative: true \ No newline at end of file diff --git a/data/rules/pypi.yml b/data/rules/pypi.yml index a0bd49d..1b441f9 100644 --- a/data/rules/pypi.yml +++ b/data/rules/pypi.yml @@ -3,11 +3,10 @@ rules: id: kingfisher.pypi.1 pattern: | (?xi) - \b ( pypi-AgEIcHlwaS5vcmc[A-Z0-9_-]{50,} ) - (?:[^A-Z0-9_-]) + \b pattern_requirements: min_digits: 2 min_entropy: 3.3 diff --git a/data/rules/recaptcha.yml b/data/rules/recaptcha.yml index a1d1631..0c40e91 100644 --- a/data/rules/recaptcha.yml +++ b/data/rules/recaptcha.yml @@ -9,7 +9,6 @@ rules: ( 6l[c-f][a-z0-9_-].{36} ) - (?:[^A-Za-z0-9/]) pattern_requirements: min_digits: 3 min_entropy: 3 diff --git a/data/rules/sentry.yml b/data/rules/sentry.yml index b53c8af..a53b647 100644 --- a/data/rules/sentry.yml +++ b/data/rules/sentry.yml @@ -12,14 +12,14 @@ rules: ( [a-f0-9]{64} ) - (?:[^A-Za-z0-9/+=]) + \b pattern_requirements: min_digits: 2 min_entropy: 3.5 confidence: medium examples: - - SENTRY_TOKEN=abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcd - - '"sentry": "1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef"' + - SENTRY_TOKEN=cbadefcbadefcbadefcbadefcbadefcbadefcbadefcbadefcbadefcbadefcbad + - '"sentry-key": "3214567890cbadef3214567890cbadef3214567890cbadef3214567890cbadef"' references: - https://docs.sentry.io/api/auth/ validation: @@ -41,7 +41,6 @@ rules: id: kingfisher.sentry.2 pattern: | (?xi) - \b ( 
sntrys_eyJpYXQiO[a-zA-Z0-9+/]{10,200}(?:LCJyZWdpb25fdXJs|InJlZ2lvbl91cmwi|cmVnaW9uX3VybCI6)[a-zA-Z0-9+/]{10,200}={0,2}_[a-zA-Z0-9+/]{43} ) @@ -51,8 +50,8 @@ rules: min_entropy: 4.2 confidence: medium examples: - - sntrys_eyJpYXQiOjE2OTA4ODAwMDAsInJlZ2lvbl91cmwiOiJodHRwczovL3NlbnRyeS5pby9vcmdzL215LW9yZy8ifQ==_abcdefghijklmnopqrstuvwx1234567890abcdefabc - - sntrys_eyJpYXQiOiIxNjkwODgwMDAwIiwicmVnaW9uX3VybCI6Imh0dHBzOi8vc2VudHJ5LmlvLyJ9_abcdABCD1234567890abcdABCD1234567890abcdABC + - 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 + - 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 references: - https://docs.sentry.io/api/auth/ validation: @@ -84,8 +83,8 @@ rules: min_entropy: 3.5 confidence: medium examples: - - sntryu_abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcd - - SNTRY_USER="sntryu_1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef" + - 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 + - SNTRY_USER="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" references: - https://docs.sentry.io/api/auth/ validation: diff --git a/data/rules/telegram.yml b/data/rules/telegram.yml index f77daf4..3f628a6 100644 --- a/data/rules/telegram.yml +++ b/data/rules/telegram.yml @@ -11,7 +11,7 @@ rules: : [A-Z0-9_-]{35} ) - (?:[^A-Za-z0-9/+=]) + \b pattern_requirements: min_digits: 2 confidence: medium diff --git a/src/decompress.rs b/src/decompress.rs index 46ae5d5..bef07c6 100644 --- a/src/decompress.rs +++ b/src/decompress.rs @@ -340,7 +340,7 @@ mod tests { fn smoke_decompress_tar_gz_archive() -> anyhow::Result<()> { let dir = tempdir()?; let tar_gz = dir.path().join("payload.tar.gz"); - let github_pat = "ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa"; // this is not a real secret + let github_pat = "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"; 
// this is not a real secret // build payload.tar.gz containing secret.txt { @@ -393,7 +393,7 @@ mod tests { fn smoke_decompress_without_extract_archives() -> anyhow::Result<()> { let dir = tempdir()?; let tar_gz = dir.path().join("payload.tar.gz"); - let github_pat = "ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa"; + let github_pat = "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"; // ── build payload.tar.gz containing secret.txt ────────────────────────────── { diff --git a/src/validation.rs b/src/validation.rs index 3d47d48..1c1491a 100644 --- a/src/validation.rs +++ b/src/validation.rs @@ -996,8 +996,14 @@ mod tests { let mut globals = Object::new(); populate_globals_from_captures(&mut globals, &captured_values); - assert_eq!(globals.get("TOKEN").map(|v| v.to_string()), Some("longervalue".to_string())); - assert_eq!(globals.get("BODY").map(|v| v.to_string()), Some("body".to_string())); + assert_eq!( + globals.get("TOKEN"), + Some(Value::scalar("longervalue")).as_ref() + ); + assert_eq!( + globals.get("BODY"), + Some(Value::scalar("body")).as_ref() + ); } #[test] @@ -1008,7 +1014,10 @@ mod tests { populate_globals_from_captures(&mut globals, &captured_values); assert!(globals.get("TOKEN").is_none()); - assert_eq!(globals.get("CHECKSUM").map(|v| v.to_string()), Some("123456".to_string())); + assert_eq!( + globals.get("CHECKSUM"), + Some(Value::scalar("123456")).as_ref() + ); } } diff --git a/tests/int_base64.rs b/tests/int_base64.rs index be06570..dfb0289 100644 --- a/tests/int_base64.rs +++ b/tests/int_base64.rs @@ -8,8 +8,8 @@ use tempfile::tempdir; fn detects_base64_encoded_secret() -> anyhow::Result<()> { let dir = tempdir()?; let file_path = dir.path().join("secret.txt"); - // Base64 for ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa - let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDJxTHFQYQ=="; + // Base64 for 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 + let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDBNV0h4cw=="; fs::write(&file_path, encoded)?; 
Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) @@ -26,7 +26,7 @@ fn detects_base64_encoded_secret() -> anyhow::Result<()> { .assert() .code(200) .stdout( - predicate::str::contains("ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa") + predicate::str::contains("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") .and(predicate::str::contains("\"encoding\": \"base64\"")), ); @@ -39,7 +39,7 @@ fn detects_base64_encoded_secret() -> anyhow::Result<()> { fn skips_base64_when_disabled() -> anyhow::Result<()> { let dir = tempdir()?; let file_path = dir.path().join("secret.txt"); - let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDJxTHFQYQ=="; + let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDBNV0h4cw=="; fs::write(&file_path, encoded)?; Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) @@ -92,8 +92,8 @@ fn no_base64_skips_empty_files() -> anyhow::Result<()> { fn detects_base64_in_code_with_tree_sitter() -> anyhow::Result<()> { let dir = tempdir()?; let file_path = dir.path().join("secret.py"); - // Base64 for ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa - let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDJxTHFQYQ=="; + // Base64 for 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 + let encoded = "Z2hwXzF3dUhGaWtCS1F0Q2NIM0VCMkZCVWt5bjhrclhoUDBNV0h4cw=="; fs::write(&file_path, format!("token = \"{}\"\n", encoded))?; Command::new(assert_cmd::cargo::cargo_bin!("kingfisher")) @@ -110,7 +110,7 @@ fn detects_base64_in_code_with_tree_sitter() -> anyhow::Result<()> { .assert() .code(200) .stdout( - predicate::str::contains("ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa") + predicate::str::contains("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") .and(predicate::str::contains("\"encoding\": \"base64\"")), ); diff --git a/tests/int_slack.rs b/tests/int_slack.rs index 27c5999..c3cacc3 100644 --- a/tests/int_slack.rs +++ b/tests/int_slack.rs @@ -159,7 +159,7 @@ async fn test_scan_slack_messages() -> Result<()> { "messages": { "matches": [{ "permalink": 
"https://example.slack.com/archives/C123/p1234", - "text": "This contains a github token ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa", + "text": "This contains a github token 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", "ts": "1234.56", "channel": {"id": "C123", "name": "general"} }], diff --git a/tests/smoke_archive.rs b/tests/smoke_archive.rs index 1ee547e..3e08a0a 100644 --- a/tests/smoke_archive.rs +++ b/tests/smoke_archive.rs @@ -7,7 +7,7 @@ fn smoke_scan_tar_gz_archive() -> anyhow::Result<()> { let dir = tempfile::tempdir()?; let tar_gz = dir.path().join("payload.tar.gz"); - let github_pat = "ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa"; + let github_pat = "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"; // --- build a payload.tar.gz ------------------------------------------------- { diff --git a/tests/smoke_baseline.rs b/tests/smoke_baseline.rs index 8f9a8f2..734682c 100644 --- a/tests/smoke_baseline.rs +++ b/tests/smoke_baseline.rs @@ -5,7 +5,7 @@ use clap::Parser; use predicates::prelude::*; use tempfile::tempdir; -const GH_PAT: &str = "ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa"; +const GH_PAT: &str = "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"; #[test] fn manage_baseline_enables_no_dedup() -> anyhow::Result<()> { diff --git a/tests/smoke_exclude.rs b/tests/smoke_exclude.rs index 504ecb5..048c85f 100644 --- a/tests/smoke_exclude.rs +++ b/tests/smoke_exclude.rs @@ -4,7 +4,7 @@ use assert_cmd::Command; use predicates::prelude::*; use tempfile::tempdir; -const SECRET: &str = "ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa"; +const SECRET: &str = "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"; #[test] fn exclude_pattern_hides_matches() -> anyhow::Result<()> { diff --git a/tests/smoke_fs.rs b/tests/smoke_fs.rs index 2e017fe..55c45fa 100644 --- a/tests/smoke_fs.rs +++ b/tests/smoke_fs.rs @@ -5,7 +5,7 @@ use assert_cmd::prelude::*; use predicates::prelude::*; use tempfile::tempdir; -const GITHUB_PAT: &str = "ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa"; +const GITHUB_PAT: &str = 
"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"; #[test] fn smoke_scan_filesystem_text_and_binary() -> anyhow::Result<()> { diff --git a/tests/smoke_git.rs b/tests/smoke_git.rs index d1305fb..b260bb9 100644 --- a/tests/smoke_git.rs +++ b/tests/smoke_git.rs @@ -15,7 +15,7 @@ fn smoke_scan_git_history() -> anyhow::Result<()> { // commit v1 let file_path = repo_dir.join("config.yml"); - fs::write(&file_path, b"ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa")?; + fs::write(&file_path, b"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")?; let mut idx = repo.index()?; idx.add_path(std::path::Path::new("config.yml"))?; let oid1 = idx.write_tree()?; @@ -23,7 +23,7 @@ fn smoke_scan_git_history() -> anyhow::Result<()> { repo.commit(Some("HEAD"), &sig, &sig, "init", &tree1, &[])?; // commit v2 (same leak, will test dedup) - fs::write(&file_path, b"ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa # unchanged")?; + fs::write(&file_path, b"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 # unchanged")?; idx.add_path(std::path::Path::new("config.yml"))?; let oid2 = idx.write_tree()?; let tree2 = repo.find_tree(oid2)?; @@ -44,7 +44,7 @@ fn smoke_scan_git_history() -> anyhow::Result<()> { ]) .assert() .code(200) // ← kingfisher’s “findings present” status - .stdout(predicate::str::contains("ghp_1wuHFikBKQtCcH3EB2FBUkyn8krXhP2qLqPa")); + .stdout(predicate::str::contains("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")); dir.close()?; Ok(()) From 0c07f3729fc293cdd9e6abd98df214a382661319 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sat, 8 Nov 2025 15:03:53 -0800 Subject: [PATCH 15/19] fixing rules --- src/validation.rs | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/src/validation.rs b/src/validation.rs index 1c1491a..f93ec11 100644 --- a/src/validation.rs +++ b/src/validation.rs @@ -996,14 +996,8 @@ mod tests { let mut globals = Object::new(); populate_globals_from_captures(&mut globals, &captured_values); - assert_eq!( - globals.get("TOKEN"), - 
Some(Value::scalar("longervalue")).as_ref() - ); - assert_eq!( - globals.get("BODY"), - Some(Value::scalar("body")).as_ref() - ); + assert_eq!(globals.get("TOKEN"), Some(Value::scalar("longervalue")).as_ref()); + assert_eq!(globals.get("BODY"), Some(Value::scalar("body")).as_ref()); } #[test] @@ -1014,10 +1008,7 @@ mod tests { populate_globals_from_captures(&mut globals, &captured_values); assert!(globals.get("TOKEN").is_none()); - assert_eq!( - globals.get("CHECKSUM"), - Some(Value::scalar("123456")).as_ref() - ); + assert_eq!(globals.get("CHECKSUM"), Some(Value::scalar("123456")).as_ref()); } } From 1ee9e804b0e415625879a392dc0e2b7d0807bf98 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sat, 8 Nov 2025 16:01:58 -0800 Subject: [PATCH 16/19] updated confluent rule with a checksum. Added zuplo rule with a checksum --- data/rules/confluent.yml | 37 ++++++ data/rules/zuplo.yml | 22 ++++ docs/RULES.md | 6 +- src/liquid_filters.rs | 242 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 306 insertions(+), 1 deletion(-) create mode 100644 data/rules/zuplo.yml diff --git a/data/rules/confluent.yml b/data/rules/confluent.yml index ce8b003..49e9b69 100644 --- a/data/rules/confluent.yml +++ b/data/rules/confluent.yml @@ -52,6 +52,43 @@ rules: - 200 type: StatusMatch url: https://api.confluent.cloud/iam/v2/api-keys/{{ CLIENTID }} + depends_on_rule: + - rule_id: "kingfisher.confluent.1" + variable: CLIENTID + - name: Confluent API Secret - Updated Format + id: kingfisher.confluent.3 + pattern: | + (?xi) + \b + ( + cflt(?P[A-Za-z0-9\+/]{54})(?P[A-Za-z0-9\+/]{6}) + ) + pattern_requirements: + checksum: + actual: + template: "{{ MATCH | suffix: 6 }}" + requires_capture: checksum + expected: "{{ BODY | crc32_le_b64: 6 }}" + skip_if_missing: true + min_entropy: 3.3 + confidence: medium + examples: + - confluent secret=cfltqPLd2lLPAtWtHGNhN32WlZxoEj30pcg8mzaPlPJ937JlMa7n9YCRLooqgifw + references: + - 
https://docs.confluent.io/cloud/current/api.html#tag/API-Keys-(iamv2)/operation/getIamV2ApiKey + validation: + type: Http + content: + request: + headers: + Authorization: 'Basic {{ CLIENTID | append: ":" | append: TOKEN | b64enc }}' + method: GET + response_matcher: + - report_response: true + - status: + - 200 + type: StatusMatch + url: https://api.confluent.cloud/iam/v2/api-keys/{{ CLIENTID }} depends_on_rule: - rule_id: "kingfisher.confluent.1" variable: CLIENTID \ No newline at end of file diff --git a/data/rules/zuplo.yml b/data/rules/zuplo.yml new file mode 100644 index 0000000..22ed4c1 --- /dev/null +++ b/data/rules/zuplo.yml @@ -0,0 +1,22 @@ +rules: + - name: Zuplo API Key + id: kingfisher.zuplo.1 + pattern: | + (?xi) + \b + ( + zpka_(?P[a-z0-9]{32})_(?P[0-9a-f]{8}) + ) + pattern_requirements: + checksum: + actual: + template: "{{ CHECKSUM | downcase }}" + requires_capture: checksum + expected: "{{ BODY | crc32_hex }}" + min_entropy: 3.3 + confidence: medium + examples: + - zpka_3e6c4f7d39954ca29353b7ab88589b64_de26cd55 + - zpka_b3f94d8d3d4d4a6ea5c5b20d0a5bb407_18eb262b + references: + - https://zuplo.com/blog/api-key-authentication diff --git a/docs/RULES.md b/docs/RULES.md index 262f66e..9e4d8e5 100644 --- a/docs/RULES.md +++ b/docs/RULES.md @@ -119,11 +119,15 @@ Below is the complete list of Liquid filters available in Kingfisher, along with | `b64url_enc` | – | URL-safe Base64 (no padding). Useful for JWT headers & payloads. | `{{ TOKEN \| b64url_enc }}` | | `b64dec` | – | Decodes a Base64 string. | `{{ "aGVsbG8=" \| b64dec }}` | | `sha256` | – | Computes the SHA-256 hex digest of the input. | `{{ TOKEN \| sha256 }}` | -| `crc32` | – | Computes the CRC32 checksum of the input and returns a decimal value. | `{{ TOKEN \| crc32 }}` | +| `crc32` | – | Computes the CRC32 checksum of the input and returns a decimal value. 
| `{{ TOKEN \| crc32 }}` | +| `crc32_dec` | `digits` (integer, optional) | Computes the CRC32 checksum and returns the last `digits` decimal characters (zero-padded). Defaults to the full value when omitted. | `{{ TOKEN \| crc32_dec: 6 }}` | +| `crc32_hex` | `digits` (integer, optional) | Computes the CRC32 checksum and returns the last `digits` hexadecimal characters (zero-padded). Defaults to the full value when omitted. | `{{ TOKEN \| crc32_hex: 8 }}` | +| `crc32_le_b64` | `len` (integer, optional) | Computes the CRC32 checksum, encodes the little-endian bytes using Base64, and optionally truncates to the first `len` characters. | `{{ TOKEN \| crc32_le_b64: 6 }}` | | `hmac_sha1` | `key` (string) | Computes HMAC-SHA1 over the input, returns Base64-encoded result. | `{{ TOKEN \| hmac_sha1: "secret-key" }}` | | `hmac_sha256` | `key` (string) | Computes HMAC-SHA256 over the input, returns Base64-encoded result. | `{{ TOKEN \| hmac_sha256: "secret-key" }}` | | `hmac_sha384` | `key` (string) | Computes HMAC-SHA384 over the input, returns Base64-encoded result. | `{{ TOKEN \| hmac_sha384: "secret-key" }}` | | `random_string` | `len` (integer, optional) | Generates a cryptographically-secure random alphanumeric string of the specified length (default: 32). | `{{ "" \| random_string: 16 }}` | +| `prefix` | `len` (integer, optional) | Returns the first `len` characters from the string (default: full). | `{{ TOKEN \| prefix: 6 }}` | | `suffix` | `len` (integer, optional) | Returns the last `len` characters from the string (default: full). | `{{ TOKEN \| suffix: 6 }}` | | `base62` | `width` (integer, optional) | Encodes the input number as Base62, left-padding with zeros as needed. | `{{ TOKEN \| crc32 \| base62: 6 }}` | | `url_encode` | – | Percent-encodes the input according to RFC 3986. 
| `{{ TOKEN \| url_encode }}` | diff --git a/src/liquid_filters.rs b/src/liquid_filters.rs index 5f02b9b..9112fb6 100644 --- a/src/liquid_filters.rs +++ b/src/liquid_filters.rs @@ -309,6 +309,49 @@ impl Filter for Suffix { } } +#[derive(Debug, FilterParameters)] +struct PrefixArgs { + #[parameter(description = "Number of leading characters to keep", arg_type = "integer")] + len: Option, +} + +#[derive(Clone, ParseFilter, FilterReflection, Default)] +#[filter( + name = "prefix", + description = "Return the prefix (first N characters) of the provided string.", + parameters(PrefixArgs), + parsed(Prefix) +)] +pub struct PrefixFilter; + +#[derive(Debug, FromFilterParameters, Display_filter)] +#[name = "prefix"] +struct Prefix { + #[parameters] + args: PrefixArgs, +} + +impl Filter for Prefix { + fn evaluate(&self, input: &dyn ValueView, runtime: &dyn Runtime) -> Result { + let args = self.args.evaluate(runtime)?; + let text = input.to_kstr(); + let requested = args + .len + .and_then(|value| { + let scalar = Value::scalar(value); + value_to_usize(&scalar) + }) + .unwrap_or_else(|| text.len()); + if requested == 0 { + return Ok(Value::scalar(String::new())); + } + + let mut chars: Vec = text.chars().collect(); + chars.truncate(requested.min(chars.len())); + Ok(Value::scalar(chars.into_iter().collect::())) + } +} + #[derive(Debug, Clone, Default, FilterReflection, ParseFilter)] #[filter( name = "b64enc", @@ -387,6 +430,175 @@ static_filter!( } ); +#[derive(Debug, FilterParameters)] +struct Crc32DecArgs { + #[parameter( + description = "Number of trailing decimal digits to return (zero padded)", + arg_type = "integer" + )] + digits: Option, +} + +#[derive(Clone, ParseFilter, FilterReflection, Default)] +#[filter( + name = "crc32_dec", + description = "Compute the CRC32 and optionally return the last N decimal digits.", + parameters(Crc32DecArgs), + parsed(Crc32Dec) +)] +pub struct Crc32DecFilter; + +#[derive(Debug, FromFilterParameters, Display_filter)] +#[name = 
"crc32_dec"] +struct Crc32Dec { + #[parameters] + args: Crc32DecArgs, +} + +impl Filter for Crc32Dec { + fn evaluate(&self, input: &dyn ValueView, runtime: &dyn Runtime) -> Result { + let args = self.args.evaluate(runtime)?; + let mut hasher = Hasher::new(); + hasher.update(input.to_kstr().as_bytes()); + let checksum = u128::from(hasher.finalize()); + + let digits = args + .digits + .and_then(|value| { + let scalar = Value::scalar(value); + value_to_usize(&scalar) + }) + .unwrap_or(0); + + if digits == 0 { + return Ok(Value::scalar(checksum.to_string())); + } + + let clamped_digits = digits.min(38); // 10^38 fits within u128 + let modulus = 10u128.pow(clamped_digits as u32); + let truncated = checksum % modulus; + let mut value = truncated.to_string(); + if clamped_digits > value.len() { + let mut padded = String::with_capacity(clamped_digits); + for _ in 0..(clamped_digits - value.len()) { + padded.push('0'); + } + padded.push_str(&value); + value = padded; + } + + Ok(Value::scalar(value)) + } +} + +#[derive(Debug, FilterParameters)] +struct Crc32HexArgs { + #[parameter( + description = "Number of trailing hexadecimal digits to return (zero padded)", + arg_type = "integer" + )] + digits: Option, +} + +#[derive(Clone, ParseFilter, FilterReflection, Default)] +#[filter( + name = "crc32_hex", + description = "Compute the CRC32 and optionally return the last N hexadecimal digits.", + parameters(Crc32HexArgs), + parsed(Crc32Hex) +)] +pub struct Crc32HexFilter; + +#[derive(Debug, FromFilterParameters, Display_filter)] +#[name = "crc32_hex"] +struct Crc32Hex { + #[parameters] + args: Crc32HexArgs, +} + +impl Filter for Crc32Hex { + fn evaluate(&self, input: &dyn ValueView, runtime: &dyn Runtime) -> Result { + let args = self.args.evaluate(runtime)?; + let mut hasher = Hasher::new(); + hasher.update(input.to_kstr().as_bytes()); + let checksum = hasher.finalize(); + let mut hex = format!("{checksum:08x}"); + + let digits = args + .digits + .and_then(|value| { + let scalar 
= Value::scalar(value); + value_to_usize(&scalar) + }) + .unwrap_or(0); + + if digits == 0 { + return Ok(Value::scalar(hex)); + } + + let clamped = digits.min(32); + if clamped > hex.len() { + let mut padded = String::with_capacity(clamped); + for _ in 0..(clamped - hex.len()) { + padded.push('0'); + } + padded.push_str(&hex); + hex = padded; + } else { + let start = hex.len() - clamped; + hex = hex[start..].to_string(); + } + + Ok(Value::scalar(hex)) + } +} + +#[derive(Debug, FilterParameters)] +struct Crc32LeB64Args { + #[parameter( + description = "Number of leading characters from the Base64 string to keep", + arg_type = "integer" + )] + len: Option, +} + +#[derive(Clone, ParseFilter, FilterReflection, Default)] +#[filter( + name = "crc32_le_b64", + description = "Compute the CRC32, encode little-endian bytes as Base64, optionally truncating.", + parameters(Crc32LeB64Args), + parsed(Crc32LeB64) +)] +pub struct Crc32LeB64Filter; + +#[derive(Debug, FromFilterParameters, Display_filter)] +#[name = "crc32_le_b64"] +struct Crc32LeB64 { + #[parameters] + args: Crc32LeB64Args, +} + +impl Filter for Crc32LeB64 { + fn evaluate(&self, input: &dyn ValueView, runtime: &dyn Runtime) -> Result { + let args = self.args.evaluate(runtime)?; + let mut hasher = Hasher::new(); + hasher.update(input.to_kstr().as_bytes()); + let checksum = hasher.finalize(); + let encoded = general_purpose::STANDARD.encode(checksum.to_le_bytes()); + + let output = if let Some(len) = args.len.and_then(|value| { + let scalar = Value::scalar(value); + value_to_usize(&scalar) + }) { + encoded.chars().take(len).collect::() + } else { + encoded + }; + + Ok(Value::scalar(output)) + } +} + #[derive(Debug, FilterParameters)] struct Base62Args { #[parameter( @@ -590,7 +802,11 @@ pub fn register_all(builder: liquid::ParserBuilder) -> liquid::ParserBuilder { .filter(B64DecFilter::default()) .filter(RandomStringFilter::default()) .filter(SuffixFilter::default()) + .filter(PrefixFilter::default()) 
.filter(Crc32Filter::default()) + .filter(Crc32DecFilter::default()) + .filter(Crc32HexFilter::default()) + .filter(Crc32LeB64Filter::default()) .filter(Base62Filter::default()) .filter(HmacSha256::default()) .filter(HmacSha1::default()) @@ -645,6 +861,13 @@ mod tests { assert_eq!(render(r#"{{ "value" | suffix: 0 }}"#), ""); } + #[test] + fn prefix_filter() { + assert_eq!(render(r#"{{ "abcdef" | prefix: 3 }}"#), "abc"); + assert_eq!(render(r#"{{ "short" | prefix: 10 }}"#), "short"); + assert_eq!(render(r#"{{ "value" | prefix: 0 }}"#), ""); + } + #[test] fn crc32_and_base62_filters() { assert_eq!(render(r#"{{ "hello" | crc32 }}"#), "907060870"); @@ -652,6 +875,25 @@ mod tests { assert_eq!(render(r#"{{ "hello" | crc32 | base62: 6 }}"#), "0zNvy2"); } + #[test] + fn crc32_dec_filter() { + assert_eq!(render(r#"{{ "hello" | crc32_dec }}"#), "907060870"); + assert_eq!(render(r#"{{ "hello" | crc32_dec: 6 }}"#), "060870"); + } + + #[test] + fn crc32_hex_filter() { + assert_eq!(render(r#"{{ "hello" | crc32_hex }}"#), "3610a686"); + assert_eq!(render(r#"{{ "hello" | crc32_hex: 4 }}"#), "a686"); + assert_eq!(render(r#"{{ "hello" | crc32_hex: 10 }}"#), "003610a686"); + } + + #[test] + fn crc32_le_b64_filter() { + assert_eq!(render(r#"{{ "hello" | crc32_le_b64 }}"#), "hqYQNg=="); + assert_eq!(render(r#"{{ "hello" | crc32_le_b64: 6 }}"#), "hqYQNg"); + } + #[test] fn hmac_sha1_filter() { let key = b"key1"; From c856373fb5d8fd17ebca7f7647cc5b34b2d4b8f8 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sun, 9 Nov 2025 08:42:16 -0800 Subject: [PATCH 17/19] updated confluent rule with a checksum. 
Added zuplo rule with a checksum --- CHANGELOG.md | 10 +++++---- README.md | 22 ++++++++++++++++++ data/rules/azuredevops.yml | 2 +- data/rules/github.yml | 29 ------------------------ data/rules/zuplo.yml | 14 ++++++++++++ src/liquid_filters.rs | 46 ++++++++++++++++++++++++++++++++++++++ src/main.rs | 33 +++++++++++++-------------- tests/smoke_docker.rs | 1 - 8 files changed, 105 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dabda9a..8f41796 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,14 +3,16 @@ All notable changes to this project will be documented in this file. ## [v1.62.0] -- This release is focused on further improving detection accuracy, before even attempting to validate findings. -- Added `pattern_requirements` checks to rules, providing lightweight post-regex character-class validation without lookarounds. -- Added an `ignore_if_contains` option to `PatternRequirements` to drop matches containing case-insensitive placeholder words, with tests covering the new behavior. +- Added `pattern_requirements` checks to rules, providing lightweight post-regex character-class validation without lookarounds. See docs/RULES.md for detail +- Added an `ignore_if_contains` option to `pattern_requirements` to drop matches containing case-insensitive placeholder words, with tests covering the new behavior. - Updated rules to adopt the new `pattern_requirements` support. - Added checksum comparisons to `pattern_requirements`, new `suffix`, `crc32`, and `base62` Liquid filters, and verbose logging so mismatched checksums are skipped with context rather than reported as findings. - Split GitHub token detections into fine-grained/fixed-format variants and enforce checksum validation for modern GitHub token families (PAT, OAuth, App, refresh) while preserving legacy coverage. +- Added a rule for Zuplo tokens. +- Added checksum calculation for Confluent, GitHub, and Zuplo tokens, which can drastically reduce false positive reports. 
+- Improved OpsGenie validation. - Automatically enable `--no-dedup` when `--manage-baseline` is supplied so baseline management keeps every finding. - +- This release is focused on further improving detection accuracy, before even attempting to validate findings. ## [v1.61.0] - Fixed local filesystem scans to keep `open_path_as_is` enabled when opening Git repositories and only disable it for diff-based scans. diff --git a/README.md b/README.md index 72736b9..81b38b1 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ For a look at how Kingfisher has grown from its early foundations into today's f - **Broad AI SaaS coverage**: finds and validates tokens for OpenAI, Anthropic, Google Gemini, Cohere, Mistral, Stability AI, Replicate, xAI (Grok), Ollama, Langchain, Perplexity, Weights & Biases, Cerebras, Friendli, Fireworks.ai, NVIDIA NIM, Together.ai, Zhipu, and many more - **Compressed Files**: Supports extracting and scanning compressed files for secrets - **Baseline management**: generate and track baselines to suppress known secrets ([docs/BASELINE.md](/docs/BASELINE.md)) +- **Checksum-aware detection**: verifies tokens with built-in checksums (e.g., GitHub, Confluent, Zuplo) — no API calls required **Learn more:** [Introducing Kingfisher: Real‑Time Secret Detection and Validation](https://www.mongodb.com/blog/post/product-release-announcements/introducing-kingfisher-real-time-secret-detection-validation) @@ -68,6 +69,7 @@ See ([docs/COMPARISON.md](docs/COMPARISON.md)) - [🔐 Detection Rules at a Glance](#-detection-rules-at-a-glance) - [📝 Write Custom Rules!](#-write-custom-rules) - [Pattern requirements and placeholder filtering](#pattern-requirements-and-placeholder-filtering) + - [🔍 Checksum Intelligence (New!)](#-checksum-intelligence-new) - [🎉 Usage](#-usage) - [Basic Examples](#basic-examples) - [Scan with secret validation](#scan-with-secret-validation) @@ -343,6 +345,26 @@ checksum mismatch lengths so you can confirm why a finding was suppressed. 
Once you've done that, you can provide your custom rules (defined in a YAML file) and provide it to Kingfisher at runtime --- no recompiling required! +### 🔍 Checksum Intelligence (New!) + +Modern API tokens increasingly include **built-in checksums**, short internal digests that make each credential self-verifiable. (For background, see [GitHub’s write-up on their newer token formats](https://github.blog/engineering/platform-security/behind-githubs-new-authentication-token-formats/) and why checksums slash false positives.) + +Kingfisher supports **checksum-aware matching** in rules, enabling **offline structural verification** of credentials *without* calling third-party APIs. + +By validating each token’s internal checksum (for tokens that support checksums), Kingfisher eliminates nearly all false positives—automatically skipping structurally invalid or fake tokens before validation ever runs. + +**Why this matters** +- ✅ **Offline verification** — no API call required +- 🧠 **Industry-aligned** — compatible with prefix + checksum token designs (e.g., modern PATs) +- ⚡ **Lower false positives** — invalid tokens are filtered out by structure alone + +**Learn more**: implementation details and templating are documented in **[docs/RULES.md](docs/RULES.md)** + +--- + + +- **Checksum-aware detection**: verifies tokens with embedded checksums (offline) to cut false positives — see [docs/RULES.md](docs/RULES.md) + # 🎉 Usage ## Basic Examples diff --git a/data/rules/azuredevops.yml b/data/rules/azuredevops.yml index 8a21d80..90fa4e8 100644 --- a/data/rules/azuredevops.yml +++ b/data/rules/azuredevops.yml @@ -21,7 +21,7 @@ rules: (?xi) \b ( - [a-z0-9]{75,76}AZDO[a-z0-9]{4,5} + [a-z0-9]{76}AZDO[a-z0-9]{4,5} ) \b pattern_requirements: diff --git a/data/rules/github.yml b/data/rules/github.yml index 334c7ed..3aa8d7d 100644 --- a/data/rules/github.yml +++ b/data/rules/github.yml @@ -277,32 +277,3 @@ rules: - | GITHUB_CLIENT_ID=ac58d6da7d7a84c039b7 
GITHUB_SECRET=37d02377a3e9d849e18704c3ec883f9c5787d857 - - name: GitHub Personal Access Token (fine-grained permissions) - id: kingfisher.github.9 - pattern: | - (?xi) - ( - github_pat_[0-9A-Z_]{82} - ) - examples: - - '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' - validation: - type: Http - content: - request: - method: POST - url: https://api.github.com/graphql - headers: - Authorization: token {{ TOKEN }} - Accept: application/vnd.github+json - Content-Type: application/json - body: | - { - "query": "{ viewer { login } }" - } - response_matcher: - - report_response: true - - match_all_words: true - type: WordMatch - words: - - '"login"' \ No newline at end of file diff --git a/data/rules/zuplo.yml b/data/rules/zuplo.yml index 22ed4c1..bbfdb7b 100644 --- a/data/rules/zuplo.yml +++ b/data/rules/zuplo.yml @@ -20,3 +20,17 @@ rules: - zpka_b3f94d8d3d4d4a6ea5c5b20d0a5bb407_18eb262b references: - https://zuplo.com/blog/api-key-authentication + validation: + type: Http + content: + request: + headers: + authorization: "Bearer {{ TOKEN }}" + x-api-key: "{{ TOKEN }}" + method: GET + response_matcher: + - report_response: true + - status: + - 200 + type: StatusMatch + url: https://dev.zuplo.com/v1/who-am-i \ No newline at end of file diff --git a/src/liquid_filters.rs b/src/liquid_filters.rs index 9112fb6..66a2fab 100644 --- a/src/liquid_filters.rs +++ b/src/liquid_filters.rs @@ -109,6 +109,41 @@ impl Filter for ReplaceFilter { } } +#[derive(Debug, FilterParameters)] +struct LstripCharsArgs { + #[parameter( + description = "Characters to remove from the start of the input.", + arg_type = "str" + )] + chars: Expression, +} + +#[derive(Clone, ParseFilter, FilterReflection, Default)] +#[filter( + name = "lstrip_chars", + description = "Removes the provided characters from the beginning of the string.", + parameters(LstripCharsArgs), + parsed(LstripCharsFilter) +)] +pub struct LstripChars; + +#[derive(Debug, 
FromFilterParameters, Display_filter)] +#[name = "lstrip_chars"] +struct LstripCharsFilter { + #[parameters] + args: LstripCharsArgs, +} + +impl Filter for LstripCharsFilter { + fn evaluate(&self, input: &dyn ValueView, runtime: &dyn Runtime) -> Result<Value> { + let args = self.args.evaluate(runtime)?; + let chars = args.chars.to_string(); + let input_str = input.to_kstr(); + let trimmed = input_str.trim_start_matches(|c| chars.contains(c)).to_string(); + Ok(Value::scalar(trimmed)) + } +} + // ── HMAC args ───────────────────────────────────── #[derive(Debug, FilterParameters)] struct HmacArgs { @@ -803,6 +838,7 @@ pub fn register_all(builder: liquid::ParserBuilder) -> liquid::ParserBuilder { .filter(RandomStringFilter::default()) .filter(SuffixFilter::default()) .filter(PrefixFilter::default()) + .filter(LstripChars::default()) .filter(Crc32Filter::default()) .filter(Crc32DecFilter::default()) .filter(Crc32HexFilter::default()) @@ -1013,6 +1049,16 @@ mod tests { assert_eq!(render(r#"{{ "hello world" | replace: "world", "mars" }}"#), "hello mars"); } + #[test] + fn lstrip_chars_single() { + assert_eq!(render(r#"{{ "000abc" | lstrip_chars: "0" }}"#), "abc"); + } + + #[test] + fn lstrip_chars_multiple_chars() { + assert_eq!(render(r#"{{ "-=--token" | lstrip_chars: "-=" }}"#), "token"); + } + // ------------------------------------------------------------------------- // iso_timestamp_no_frac filter // ------------------------------------------------------------------------- diff --git a/src/main.rs b/src/main.rs index 99f0718..de047db 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,27 +5,26 @@ // * Fallback - system allocator (`system-alloc` feature) // ──────────────────────────────────────────────────────────── -// // --- jemalloc (opt-in) --- -// #[cfg(feature = "use-jemalloc")] -// #[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -// // --- mimalloc (default) --- -// #[cfg(all(not(feature = "use-jemalloc"), not(feature = "system-alloc")))] -// #[global_allocator] -// static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; +// --- mimalloc (default) --- +#[cfg(all(not(feature = "use-jemalloc"), not(feature = "system-alloc")))] +#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; -// // --- system allocator (explicit opt-out) --- -// #[cfg(feature = "system-alloc")] -// use std::alloc::System; -// #[cfg(feature = "system-alloc")] -// #[global_allocator] +// --- system allocator (explicit opt-out) --- +#[cfg(feature = "system-alloc")] +use std::alloc::System; +#[cfg(feature = "system-alloc")] +#[global_allocator] // static GLOBAL: System = System; -use std::alloc::System; -#[global_allocator] -static GLOBAL: System = System; - +// use std::alloc::System; +// #[global_allocator] +// static GLOBAL: System = System; use std::{ io::{IsTerminal, Read}, sync::{Arc, Mutex}, diff --git a/tests/smoke_docker.rs b/tests/smoke_docker.rs index 46e22c7..3bd4307 100644 --- a/tests/smoke_docker.rs +++ b/tests/smoke_docker.rs @@ -1,4 +1,3 @@ -use assert_cmd::prelude::*; use std::process::Command; #[test] From 778303f0aeef58f1e164afbfdaad5047cd7a2693 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sun, 9 Nov 2025 08:47:19 -0800 Subject: [PATCH 18/19] updated confluent rule with a checksum. 
Added zuplo rule with a checksum --- src/main.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index de047db..b52fcd3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -20,11 +20,12 @@ static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; use std::alloc::System; #[cfg(feature = "system-alloc")] #[global_allocator] -// static GLOBAL: System = System; +static GLOBAL: System = System; // use std::alloc::System; // #[global_allocator] // static GLOBAL: System = System; + use std::{ io::{IsTerminal, Read}, sync::{Arc, Mutex}, From 77bcbd6130283e6c6a5697ae57585d924f868fcd Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sun, 9 Nov 2025 09:16:50 -0800 Subject: [PATCH 19/19] changes in response to code review --- docs/RULES.md | 15 ++++++++++++++- src/rules/rule.rs | 2 +- src/validation.rs | 15 +++++++-------- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/docs/RULES.md b/docs/RULES.md index 9e4d8e5..4a3dd8d 100644 --- a/docs/RULES.md +++ b/docs/RULES.md @@ -292,6 +292,19 @@ The optional `checksum` block renders Liquid templates against the match to dete When any of these filters remove a match it is logged at the `DEBUG` level so you can see exactly why the skip occurred. If you need to keep every match even when one of these substrings appears, pass `--no-ignore-if-contains` to `kingfisher scan`. The flag disables this post-processing step without changing the rule definitions. +### Are `requires_capture` and `skip_if_missing` equivalent? + +`requires_capture` + - Optional field that names a specific regex capture that must be present before the checksum templates are evaluated. + - In the engine, Kingfisher checks whether that capture exists in the match context. If it’s missing, the behavior falls back to whatever `skip_if_missing` dictates (fail or treat as a legacy match). 
+ +`skip_if_missing` + - Boolean switch that controls what happens when Kingfisher can’t render the checksum—because there’s no match context or a required capture is absent. + - `true`: silently skip (pass) the match so legacy, non-checksum tokens are still accepted. + - `false`: treat the situation as a validation failure. + +In short, `requires_capture` identifies which capture must exist, while `skip_if_missing` determines whether missing data is a hard failure or an allowed legacy case. + ### Example: Secure API Key ```yaml @@ -316,7 +329,7 @@ rules: - test examples: - api_key = "MyS3cur3K3y!2024" - - api-key: "Abc123!@#Token" + - 'api-key: "Abc123!@#Token"' ``` In this example: diff --git a/src/rules/rule.rs b/src/rules/rule.rs index 2284e1f..1c2fb08 100644 --- a/src/rules/rule.rs +++ b/src/rules/rule.rs @@ -12,7 +12,7 @@ use anyhow::{anyhow, Context, Result}; use lazy_static::lazy_static; use liquid::{ model::{KString, Value}, - object, Parser, ParserBuilder, + object, ParserBuilder, }; use regex::Regex; use schemars::{ diff --git a/src/validation.rs b/src/validation.rs index f93ec11..778127c 100644 --- a/src/validation.rs +++ b/src/validation.rs @@ -963,21 +963,20 @@ fn populate_globals_from_captures( globals: &mut Object, captured_values: &[(String, String, usize, usize)], ) { - let mut best_token: Option<(usize, String)> = None; + let mut best_token: Option<&String> = None; for (k, v, ..) 
in captured_values { - let key = k.to_uppercase(); - if key == "TOKEN" { - if best_token.as_ref().map_or(true, |(len, _)| v.len() >= *len) { - best_token = Some((v.len(), v.clone())); + if k.eq_ignore_ascii_case("TOKEN") { + if best_token.map_or(true, |best| v.len() >= best.len()) { + best_token = Some(v); } } else { - globals.insert(key.into(), Value::scalar(v.clone())); + globals.insert(k.to_uppercase().into(), Value::scalar(v.clone())); } } - if let Some((_, token)) = best_token { - globals.insert("TOKEN".into(), Value::scalar(token)); + if let Some(token) = best_token { + globals.insert("TOKEN".into(), Value::scalar(token.clone())); } }