# robots.txt for hellojade.ai
# ---------------------------------------------------------------------------
# Implements the Robots Exclusion Protocol (REP / RFC 9309) exactly as Google
# interprets it. Derived from:
#   developers.google.com/crawling/docs/robots-txt/robots-txt-spec
#   developers.google.com/crawling/docs/robots-txt/create-robots-txt
#   developers.google.com/crawling/docs/robots-txt/useful-robots-txt-rules
#
# RULES OF THE FILE (enforced by Google):
#   - Must live at the ROOT of the host:  https://hellojade.ai/robots.txt
#     (a robots.txt in a subdirectory is ignored; one file per host/protocol/port).
#   - UTF-8 plain text, lines separated by CR, CR/LF, or LF. Max 500 KiB.
#   - Supported fields ONLY: user-agent, allow, disallow, sitemap.
#     crawl-delay is NOT supported by Google and is ignored.
#   - Field names are case-INSENSITIVE; path VALUES are case-SENSITIVE.
#   - Wildcards: '*' = 0+ of any char; '$' = end of URL. (allow/disallow only.)
#   - Precedence: the MOST SPECIFIC (longest matching path) rule wins; on a tie,
#     the LEAST RESTRICTIVE (allow) wins.
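#   - Group selection: a crawler obeys ONLY the most specific group that names
#     it; rules are never merged across groups (a named group does not inherit
#     anything from 'User-agent: *').
#   - The global group (User-agent: *) does NOT apply to AdsBot / Mediapartners /
#     APIs-Google — those must be named explicitly (they ignore '*').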
# ---------------------------------------------------------------------------

# === Global group: applies to every crawler that has NO group of its own in
#     this file, and never to the crawlers that ignore '*' ===
# A crawler that is named in its own group further down uses that group only;
# none of the rules below carry over to it.
User-agent: *
# Block crawl-budget sinks: faceted-navigation parameter URLs.
# (see /crawling/docs/faceted-navigation — disallow filter params, allow the
#  unfiltered listing.)
# '/*?*name=' matches the parameter anywhere in the query string.
Disallow: /*?*sort=
Disallow: /*?*filter=
Disallow: /*?*color=
Disallow: /*?*size=
Disallow: /*?*products=
# Longer (more specific) pattern than the products= Disallow above, so under
# longest-match precedence it wins for URLs ending exactly in '?products=all'.
Allow: /*?products=all$
# Private / non-public areas (do NOT rely on robots.txt for secrets — use auth).
Disallow: /admin/
Disallow: /cart/
Disallow: /checkout/
# No trailing slash: prefix match on every path that starts with '/search'.
Disallow: /search
Disallow: /*?*session=
# Internal includes (CSS/JS) are blocked for generic crawlers; the Googlebot
# group below leaves them crawlable so Google can render pages correctly.
Disallow: /includes/

# === Googlebot: same rules as the global group, minus the /includes/ block ===
# A crawler obeys ONLY the single most specific group that names it; groups are
# never merged. Googlebot therefore ignores every rule under 'User-agent: *',
# so each global rule that must still apply to it is repeated here.
# /includes/ is deliberately NOT disallowed so Googlebot can fetch the CSS/JS
# it needs for rendering; the explicit Allow records that intent.
User-agent: Googlebot
Allow: /includes/
# Faceted-navigation parameters (keep in sync with the global group).
Disallow: /*?*sort=
Disallow: /*?*filter=
Disallow: /*?*color=
Disallow: /*?*size=
Disallow: /*?*products=
# Longer (more specific) than the products= Disallow above, so it wins for the
# unfiltered listing URL ending in '?products=all'.
Allow: /*?products=all$
# Private / non-public areas (keep in sync with the global group).
Disallow: /admin/
Disallow: /cart/
Disallow: /checkout/
Disallow: /search
Disallow: /*?*session=

# === Googlebot-Image / Googlebot-News ===
# A crawler follows only ONE group: the most specific one that names it. Rules
# are never merged, so these crawlers see nothing from the Googlebot or '*'
# groups — every rule they must obey has to be listed here.
# Googlebot-Video has no group visible in this file; per Google's crawler
# documentation it then obeys the Googlebot group. NOTE(review): confirm that
# is the intended behavior.
User-agent: Googlebot-Image
# No-op today (nothing broader is disallowed in this group); states intent only.
Allow: /images/
Disallow: /admin/

User-agent: Googlebot-News
Disallow: /admin/
Disallow: /drafts/

# === Google Shopping (Storebot-Google) ===
# Only this group applies to the crawler; nothing is inherited from '*'.
# NOTE(review): /cart/ and /checkout/ are not blocked here — presumably so the
# crawler can reach the purchase flow; confirm this is intentional.
User-agent: Storebot-Google
# No-op while nothing broader is disallowed in this group; documents that
# product pages are meant to stay crawlable.
Allow: /products/
Disallow: /admin/

# === Google-Extended: AI / Gemini training control token (no effect on Search) ===
# The empty 'Disallow:' below matches nothing, so everything is currently
# allowed for this token.
# To opt OUT of content being used for Gemini training, uncomment the
# 'Disallow: /' line (the empty Disallow may stay; an empty path is ignored).
User-agent: Google-Extended
Disallow:
# Disallow: /

# === Vertex AI site search build (site-owner requested) ===
# Only /admin/ is blocked; this group does not inherit any rule from '*'.
User-agent: Google-CloudVertexBot
Disallow: /admin/

# === Generic Google R&D crawler ===
# Only the rules listed here apply to GoogleOther.
# NOTE(review): /checkout/ is disallowed in the global group but not here,
# while /cart/ is — confirm whether /checkout/ was left out on purpose.
User-agent: GoogleOther
Disallow: /admin/
Disallow: /cart/

# === AdsBot family — ignores 'User-agent: *', so it is named explicitly. ===
# One group serves both AdsBot tokens. Only /admin/ is off limits; every other
# page stays reachable so Google Ads can verify landing-page ad quality.
User-agent: AdsBot-Google
User-agent: AdsBot-Google-Mobile
Disallow: /admin/

# === AdSense — also ignores '*'. ===
# Only /admin/ is blocked; all other pages stay crawlable so relevant ads can
# be served.
User-agent: Mediapartners-Google
Disallow: /admin/

# === APIs-Google (push notifications) — ignores '*'. ===
# Only /admin/ is blocked for this user agent.
User-agent: APIs-Google
Disallow: /admin/

# ---------------------------------------------------------------------------
# Sitemaps: fully-qualified absolute URLs. The sitemap field is independent of
# user-agent groups, so it is available to every crawler that supports it.
# Multiple Sitemap lines are allowed.
# ---------------------------------------------------------------------------
Sitemap: https://hellojade.ai/sitemap.xml