commit c9e0dd287f30b2acb0145a7efc326c881792138a Author: David Fifield david@bamsoftware.com Date: Sun Jul 18 15:22:03 2021 -0600
amp package.
This package contains a CacheURL function that modifies a URL to be accessed through an AMP cache, and the "AMP armor" data encoding scheme for encoding data into the AMP subset of HTML. --- common/amp/armor_decoder.go | 136 +++++++++++++++++++ common/amp/armor_encoder.go | 176 ++++++++++++++++++++++++ common/amp/armor_test.go | 227 +++++++++++++++++++++++++++++++ common/amp/cache.go | 178 ++++++++++++++++++++++++ common/amp/cache_test.go | 320 ++++++++++++++++++++++++++++++++++++++++++++ common/amp/doc.go | 88 ++++++++++++ common/amp/path.go | 44 ++++++ common/amp/path_test.go | 54 ++++++++ 8 files changed, 1223 insertions(+)
diff --git a/common/amp/armor_decoder.go b/common/amp/armor_decoder.go new file mode 100644 index 0000000..fed44a6 --- /dev/null +++ b/common/amp/armor_decoder.go @@ -0,0 +1,136 @@ +package amp + +import ( + "bufio" + "bytes" + "encoding/base64" + "fmt" + "io" + + "golang.org/x/net/html" +) + +// ErrUnknownVersion is the error returned when the first character inside the +// element encoding (but outside the base64 encoding) is not '0'. +type ErrUnknownVersion byte + +func (err ErrUnknownVersion) Error() string { + return fmt.Sprintf("unknown armor version indicator %+q", byte(err)) +} + +func isASCIIWhitespace(b byte) bool { + switch b { + // https://infra.spec.whatwg.org/#ascii-whitespace + case '\x09', '\x0a', '\x0c', '\x0d', '\x20': + return true + default: + return false + } +} + +func splitASCIIWhitespace(data []byte, atEOF bool) (advance int, token []byte, err error) { + var i, j int + // Skip initial whitespace. + for i = 0; i < len(data); i++ { + if !isASCIIWhitespace(data[i]) { + break + } + } + // Look for next whitespace. + for j = i; j < len(data); j++ { + if isASCIIWhitespace(data[j]) { + return j + 1, data[i:j], nil + } + } + // We reached the end of data without finding more whitespace. Only + // consider it a token if we are at EOF. + if atEOF && i < j { + return j, data[i:j], nil + } + // Otherwise, request more data. + return i, nil, nil +} + +func decodeToWriter(w io.Writer, r io.Reader) (int64, error) { + tokenizer := html.NewTokenizer(r) + // Set a memory limit on token sizes, otherwise the tokenizer will + // buffer text indefinitely if it is not broken up by other token types. 
+ tokenizer.SetMaxBuf(elementSizeLimit) + active := false + total := int64(0) + for { + tt := tokenizer.Next() + switch tt { + case html.ErrorToken: + err := tokenizer.Err() + if err == io.EOF { + err = nil + } + if err == nil && active { + return total, fmt.Errorf("missing </pre> tag") + } + return total, err + case html.TextToken: + if active { + // Re-join the separate chunks of text and + // feed them to the decoder. + scanner := bufio.NewScanner(bytes.NewReader(tokenizer.Text())) + scanner.Split(splitASCIIWhitespace) + for scanner.Scan() { + n, err := w.Write(scanner.Bytes()) + total += int64(n) + if err != nil { + return total, err + } + } + if err := scanner.Err(); err != nil { + return total, err + } + } + case html.StartTagToken: + tn, _ := tokenizer.TagName() + if string(tn) == "pre" { + if active { + // nesting not allowed + return total, fmt.Errorf("unexpected %s", tokenizer.Token()) + } + active = true + } + case html.EndTagToken: + tn, _ := tokenizer.TagName() + if string(tn) == "pre" { + if !active { + // stray end tag + return total, fmt.Errorf("unexpected %s", tokenizer.Token()) + } + active = false + } + } + } +} + +// NewArmorDecoder returns a new AMP armor decoder. +func NewArmorDecoder(r io.Reader) (io.Reader, error) { + pr, pw := io.Pipe() + go func() { + _, err := decodeToWriter(pw, r) + pw.CloseWithError(err) + }() + + // The first byte inside the element encoding is a server–client + // protocol version indicator. 
+ var version [1]byte + _, err := pr.Read(version[:]) + if err != nil { + pr.CloseWithError(err) + return nil, err + } + switch version[0] { + case '0': + return base64.NewDecoder(base64.StdEncoding, pr), nil + default: + err := ErrUnknownVersion(version[0]) + pr.CloseWithError(err) + return nil, err + } +} diff --git a/common/amp/armor_encoder.go b/common/amp/armor_encoder.go new file mode 100644 index 0000000..5d6b0ae --- /dev/null +++ b/common/amp/armor_encoder.go @@ -0,0 +1,176 @@ +package amp + +import ( + "encoding/base64" + "io" +) + +// https://amp.dev/boilerplate/ +// https://amp.dev/documentation/guides-and-tutorials/learn/spec/amp-boilerplat... +// https://amp.dev/documentation/guides-and-tutorials/learn/spec/amphtml/?forma... +const ( + boilerplateStart = `<!doctype html> +<html amp> +<head> +<meta charset="utf-8"> +<script async src="https://cdn.ampproject.org/v0.js"></script> +<link rel="canonical" href="#"> +<meta name="viewport" content="width=device-width"> +<style amp-boilerplate>body{-webkit-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-moz-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-ms-animation:-amp-start 8s steps(1,end) 0s 1 normal both;animation:-amp-start 8s steps(1,end) 0s 1 normal both}@-webkit-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-moz-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-ms-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-o-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}</style><noscript><style amp-boilerplate>body{-webkit-animation:none;-moz-animation:none;-ms-animation:none;animation:none}</style></noscript> +</head> +<body> +` + boilerplateEnd = `</body> +</html>` +) + +const ( + // We restrict the amount of text may go inside an HTML element, in + // order to limit the amount a decoder may have to buffer. 
+ elementSizeLimit = 32 * 1024 + + // The payload is conceptually a long base64-encoded string, but we + // break the string into short chunks separated by whitespace. This is + // to protect against modification by AMP caches, which reportedly may + // truncate long words in text: + // https://bugs.torproject.org/tpo/anti-censorship/pluggable-transports/snowfla... + bytesPerChunk = 32 + + // We set the number of chunks per element so as to stay under + // elementSizeLimit. Here, we assume that there is 1 byte of whitespace + // after each chunk (with an additional whitespace byte at the beginning + // of the element). + chunksPerElement = (elementSizeLimit - 1) / (bytesPerChunk + 1) +) + +// The AMP armor encoder is a chain of a base64 encoder (base64.NewEncoder) and +// an HTML element encoder (elementEncoder). A top-level encoder (armorEncoder) +// coordinates these two, and handles prepending and appending the AMP +// boilerplate. armorEncoder's Write method writes data into the base64 encoder, +// where it makes its way through the chain. + +// NewArmorEncoder returns a new AMP armor encoder. Anything written to the +// returned io.WriteCloser will be encoded and written to w. The caller must +// call Close to flush any partially written data and output the AMP boilerplate +// trailer. +func NewArmorEncoder(w io.Writer) (io.WriteCloser, error) { + // Immediately write the AMP boilerplate header. + _, err := w.Write([]byte(boilerplateStart)) + if err != nil { + return nil, err + } + + element := &elementEncoder{w: w} + // Write a server–client protocol version indicator, outside the base64 + // layer. 
+ _, err = element.Write([]byte{'0'}) + if err != nil { + return nil, err + } + + base64 := base64.NewEncoder(base64.StdEncoding, element) + return &armorEncoder{ + w: w, + element: element, + base64: base64, + }, nil +} + +type armorEncoder struct { + base64 io.WriteCloser + element *elementEncoder + w io.Writer +} + +func (enc *armorEncoder) Write(p []byte) (int, error) { + // Write into the chain base64 | element | w. + return enc.base64.Write(p) +} + +func (enc *armorEncoder) Close() error { + // Close the base64 encoder first, to flush out any buffered data and + // the final padding. + err := enc.base64.Close() + if err != nil { + return err + } + + // Next, close the element encoder, to close any open elements. + err = enc.element.Close() + if err != nil { + return err + } + + // Finally, output the AMP boilerplate trailer. + _, err = enc.w.Write([]byte(boilerplateEnd)) + if err != nil { + return err + } + + return nil +} + +// elementEncoder arranges written data into pre elements, with the text within +// separated into chunks. It does no HTML encoding, so data written must not +// contain any bytes that are meaningful in HTML. 
+type elementEncoder struct { + w io.Writer + chunkCounter int + elementCounter int +} + +func (enc *elementEncoder) Write(p []byte) (n int, err error) { + total := 0 + for len(p) > 0 { + if enc.elementCounter == 0 && enc.chunkCounter == 0 { + _, err := enc.w.Write([]byte("<pre>\n")) + if err != nil { + return total, err + } + } + + n := bytesPerChunk - enc.chunkCounter + if n > len(p) { + n = len(p) + } + nn, err := enc.w.Write(p[:n]) + if err != nil { + return total, err + } + total += nn + p = p[n:] + + enc.chunkCounter += n + if enc.chunkCounter >= bytesPerChunk { + enc.chunkCounter = 0 + enc.elementCounter += 1 + nn, err = enc.w.Write([]byte("\n")) + if err != nil { + return total, err + } + total += nn + } + + if enc.elementCounter >= chunksPerElement { + enc.elementCounter = 0 + nn, err = enc.w.Write([]byte("</pre>\n")) + if err != nil { + return total, err + } + total += nn + } + } + return total, nil +} + +func (enc *elementEncoder) Close() error { + var err error + if !(enc.elementCounter == 0 && enc.chunkCounter == 0) { + if enc.chunkCounter == 0 { + _, err = enc.w.Write([]byte("</pre>\n")) + } else { + _, err = enc.w.Write([]byte("\n</pre>\n")) + } + } + return err +} diff --git a/common/amp/armor_test.go b/common/amp/armor_test.go new file mode 100644 index 0000000..594ae65 --- /dev/null +++ b/common/amp/armor_test.go @@ -0,0 +1,227 @@ +package amp + +import ( + "crypto/rand" + "io" + "io/ioutil" + "strings" + "testing" +) + +func armorDecodeToString(src string) (string, error) { + dec, err := NewArmorDecoder(strings.NewReader(src)) + if err != nil { + return "", err + } + p, err := ioutil.ReadAll(dec) + return string(p), err +} + +func TestArmorDecoder(t *testing.T) { + for _, test := range []struct { + input string + expectedOutput string + expectedErr bool + }{ + {` +<pre> +0 +</pre> +`, + "", + false, + }, + {` +<pre> +0aGVsbG8gd29ybGQK +</pre> +`, + "hello world\n", + false, + }, + // bad version indicator + {` +<pre> +1aGVsbG8gd29ybGQK +</pre> 
+`, + "", + true, + }, + // text outside <pre> elements + {` +0aGVsbG8gd29ybGQK +blah blah blah +<pre> +0aGVsbG8gd29ybGQK +</pre> +0aGVsbG8gd29ybGQK +blah blah blah +`, + "hello world\n", + false, + }, + {` +<pre> +0QUJDREV +GR0hJSkt +MTU5PUFF +SU1RVVld +</pre> +junk +<pre> +YWVowMTI +zNDU2Nzg +5Cg += +</pre> +<pre> += +</pre> +`, + "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\n", + false, + }, + // no <pre> elements, hence no version indicator + {` +aGVsbG8gd29ybGQK +blah blah blah +aGVsbG8gd29ybGQK +aGVsbG8gd29ybGQK +blah blah blah +`, + "", + true, + }, + // empty <pre> elements, hence no version indicator + {` +aGVsbG8gd29ybGQK +blah blah blah +<pre> </pre> +aGVsbG8gd29ybGQK +aGVsbG8gd29ybGQK<pre></pre> +blah blah blah +`, + "", + true, + }, + // other elements inside <pre> + { + "blah <pre>0aGVsb<p>G8gd29</p>ybGQK</pre>", + "hello world\n", + false, + }, + // HTML comment + { + "blah <!-- <pre>aGVsbG8gd29ybGQK</pre> -->", + "", + true, + }, + // all kinds of ASCII whitespace + { + "blah <pre>\x200\x09aG\x0aV\x0csb\x0dG8\x20gd29ybGQK</pre>", + "hello world\n", + false, + }, + + // bad padding + {` +<pre> +0QUJDREV +GR0hJSkt +MTU5PUFF +SU1RVVld +</pre> +junk +<pre> +YWVowMTI +zNDU2Nzg +5Cg += +</pre> +`, + "", + true, + }, + /* + // per-chunk base64 + // test disabled because Go stdlib handles this incorrectly: + // https://github.com/golang/go/issues/31626 + { + "<pre>QQ==</pre><pre>Qg==</pre>", + "", + true, + }, + */ + // missing </pre> + { + "blah <pre></pre><pre>0aGVsbG8gd29ybGQK", + "", + true, + }, + // nested <pre> + { + "blah <pre>0aGVsb<pre>G8gd29</pre>ybGQK</pre>", + "", + true, + }, + } { + output, err := armorDecodeToString(test.input) + if test.expectedErr && err == nil { + t.Errorf("%+q → (%+q, %v), expected error", test.input, output, err) + continue + } + if !test.expectedErr && err != nil { + t.Errorf("%+q → (%+q, %v), expected no error", test.input, output, err) + continue + } + if !test.expectedErr && output != test.expectedOutput { + t.Errorf("%+q 
→ (%+q, %v), expected (%+q, %v)", + test.input, output, err, test.expectedOutput, nil) + continue + } + } +} + +func armorRoundTrip(s string) (string, error) { + var encoded strings.Builder + enc, err := NewArmorEncoder(&encoded) + if err != nil { + return "", err + } + _, err = io.Copy(enc, strings.NewReader(s)) + if err != nil { + return "", err + } + err = enc.Close() + if err != nil { + return "", err + } + return armorDecodeToString(encoded.String()) +} + +func TestArmorRoundTrip(t *testing.T) { + lengths := make([]int, 0) + // Test short strings and lengths around elementSizeLimit thresholds. + for i := 0; i < bytesPerChunk*2; i++ { + lengths = append(lengths, i) + } + for i := -10; i < +10; i++ { + lengths = append(lengths, elementSizeLimit+i) + lengths = append(lengths, 2*elementSizeLimit+i) + } + for _, n := range lengths { + buf := make([]byte, n) + rand.Read(buf) + input := string(buf) + output, err := armorRoundTrip(input) + if err != nil { + t.Errorf("length %d → error %v", n, err) + continue + } + if output != input { + t.Errorf("length %d → %+q", n, output) + continue + } + } +} diff --git a/common/amp/cache.go b/common/amp/cache.go new file mode 100644 index 0000000..102993f --- /dev/null +++ b/common/amp/cache.go @@ -0,0 +1,178 @@ +package amp + +import ( + "crypto/sha256" + "encoding/base32" + "fmt" + "net" + "net/url" + "path" + "strings" + + "golang.org/x/net/idna" +) + +// domainPrefixBasic does the basic domain prefix conversion. Does not do any +// IDNA mapping, such as https://www.unicode.org/reports/tr46/. +// +// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors... +func domainPrefixBasic(domain string) (string, error) { + // 1. Punycode Decode the publisher domain. + prefix, err := idna.ToUnicode(domain) + if err != nil { + return "", err + } + + // 2. Replace any "-" (hyphen) character in the output of step 1 with + // "--" (two hyphens). + prefix = strings.Replace(prefix, "-", "--", -1) + + // 3. 
Replace any "." (dot) character in the output of step 2 with "-" + // (hyphen). + prefix = strings.Replace(prefix, ".", "-", -1) + + // 4. If the output of step 3 has a "-" (hyphen) at both positions 3 and + // 4, then to the output of step 3, add a prefix of "0-" and add a + // suffix of "-0". + if len(prefix) >= 4 && prefix[2] == '-' && prefix[3] == '-' { + prefix = "0-" + prefix + "-0" + } + + // 5. Punycode Encode the output of step 3. + return idna.ToASCII(prefix) +} + +// Lower-case base32 without padding. +var fallbackBase32Encoding = base32.NewEncoding("abcdefghijklmnopqrstuvwxyz234567").WithPadding(base32.NoPadding) + +// domainPrefixFallback does the fallback domain prefix conversion. The returned +// base32 domain uses lower-case letters. +// +// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors... +func domainPrefixFallback(domain string) string { + // The algorithm specification does not say what, exactly, we are to + // take the SHA-256 of. domain is notionally an abstract Unicode + // string, not a byte sequence. While + // https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db... + // says "Take the SHA256 of the punycode view of the domain," in reality + // it hashes the UTF-8 encoding of the domain, without Punycode: + // https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db... + // https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db... + // We do the same here, hashing the raw bytes of domain, presumed to be + // UTF-8. + + // 1. Hash the publisher's domain using SHA256. + h := sha256.Sum256([]byte(domain)) + + // 2. Base32 Escape the output of step 1. + // 3. Remove the last 4 characters from the output of step 2, which are + // always "=" (equals) characters. + return fallbackBase32Encoding.EncodeToString(h[:]) +} + +// domainPrefix computes the domain prefix of an AMP cache URL. 
+// +// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors... +func domainPrefix(domain string) string { + // https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors... + // 1. Run the Basic Algorithm. If the output is a valid DNS label, + // [append the Cache domain suffix and] return. Otherwise continue to + // step 2. + prefix, err := domainPrefixBasic(domain) + // "A domain prefix is not a valid DNS label if it is longer than 63 + // characters" + if err == nil && len(prefix) <= 63 { + return prefix + } + // 2. Run the Fallback Algorithm. [Append the Cache domain suffix and] + // return. + return domainPrefixFallback(domain) +} + +// CacheURL computes the AMP cache URL for the publisher URL pubURL, using the +// AMP cache at cacheURL. contentType is a string such as "c" or "i" that +// indicates what type of serving the AMP cache is to perform. The Scheme of +// pubURL must be "http" or "https". The Port of pubURL, if any, must match the +// default for the scheme. cacheURL may not have RawQuery, Fragment, or +// RawFragment set, because the resulting URL's query and fragment are taken +// from the publisher URL. +// +// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors... +func CacheURL(pubURL, cacheURL *url.URL, contentType string) (*url.URL, error) { + // The cache URL subdomain, including the domain prefix corresponding to + // the publisher URL's domain. + resultHost := domainPrefix(pubURL.Hostname()) + "." + cacheURL.Hostname() + if cacheURL.Port() != "" { + resultHost = net.JoinHostPort(resultHost, cacheURL.Port()) + } + + // https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors... + // The first part of the path is the cache URL's own path, if any. + pathComponents := []string{cacheURL.EscapedPath()} + // The next path component is the content type. 
We cannot encode an + // empty content type, because it would result in consecutive path + // separators, which would semantically combine into a single separator. + if contentType == "" { + return nil, fmt.Errorf("invalid content type %+q", contentType) + } + pathComponents = append(pathComponents, url.PathEscape(contentType)) + // Then, we add an "s" path component, if the publisher URL scheme is + // "https". + switch pubURL.Scheme { + case "http": + // Do nothing. + case "https": + pathComponents = append(pathComponents, "s") + default: + return nil, fmt.Errorf("invalid scheme %+q in publisher URL", pubURL.Scheme) + } + // The next path component is the publisher URL's host. The AMP cache + // URL format specification is not clear about whether other + // subcomponents of the authority (namely userinfo and port) may appear + // here. We adopt a policy of forbidding userinfo, and requiring that + // the port be the default for the scheme (and then we omit the port + // entirely from the returned URL). + if pubURL.User != nil { + return nil, fmt.Errorf("publisher URL may not contain userinfo") + } + if port := pubURL.Port(); port != "" { + if !((pubURL.Scheme == "http" && port == "80") || (pubURL.Scheme == "https" && port == "443")) { + return nil, fmt.Errorf("publisher URL port %+q is not the default for scheme %+q", port, pubURL.Scheme) + } + } + // As with the content type, we cannot encode an empty host, because + // that would result in an empty path component. + if pubURL.Hostname() == "" { + return nil, fmt.Errorf("invalid host %+q in publisher URL", pubURL.Hostname()) + } + pathComponents = append(pathComponents, url.PathEscape(pubURL.Hostname())) + // Finally, we append the remainder of the original escaped path from + // the publisher URL. + pathComponents = append(pathComponents, pubURL.EscapedPath()) + + resultRawPath := path.Join(pathComponents...) 
+ resultPath, err := url.PathUnescape(resultRawPath) + if err != nil { + return nil, err + } + + // The query and fragment of the returned URL always come from pubURL. + // Any query or fragment of cacheURL would be ignored. Return an error + // if either is set. + if cacheURL.RawQuery != "" { + return nil, fmt.Errorf("cache URL may not contain a query") + } + if cacheURL.Fragment != "" { + return nil, fmt.Errorf("cache URL may not contain a fragment") + } + + return &url.URL{ + Scheme: cacheURL.Scheme, + User: cacheURL.User, + Host: resultHost, + Path: resultPath, + RawPath: resultRawPath, + RawQuery: pubURL.RawQuery, + Fragment: pubURL.Fragment, + }, nil +} diff --git a/common/amp/cache_test.go b/common/amp/cache_test.go new file mode 100644 index 0000000..45950fd --- /dev/null +++ b/common/amp/cache_test.go @@ -0,0 +1,320 @@ +package amp + +import ( + "bytes" + "net/url" + "testing" + + "golang.org/x/net/idna" +) + +func TestDomainPrefixBasic(t *testing.T) { + // Tests expecting no error. + for _, test := range []struct { + domain, expected string + }{ + {"", ""}, + {"xn--", ""}, + {"...", "---"}, + + // Should not apply mappings such as case folding and + // normalization. + {"b\u00fccher.de", "xn--bcher-de-65a"}, + {"B\u00fccher.de", "xn--Bcher-de-65a"}, + {"bu\u0308cher.de", "xn--bucher-de-hkf"}, + + // Check some that differ between IDNA 2003 and IDNA 2008. + // https://unicode.org/reports/tr46/#Deviations + // https://util.unicode.org/UnicodeJsps/idna.jsp + {"faß.de", "xn--fa-de-mqa"}, + {"βόλοσ.com", "xn---com-4ld8c2a6a8e"}, + + // Lengths of 63 and 64. 64 is too long for a DNS label, but + // domainPrefixBasic is not expected to check for that. 
+ {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}, + {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}, + + // https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors... + {"example.com", "example-com"}, + {"foo.example.com", "foo-example-com"}, + {"foo-example.com", "foo--example-com"}, + {"xn--57hw060o.com", "xn---com-p33b41770a"}, + {"\u26a1\U0001f60a.com", "xn---com-p33b41770a"}, + {"en-us.example.com", "0-en--us-example-com-0"}, + } { + output, err := domainPrefixBasic(test.domain) + if err != nil || output != test.expected { + t.Errorf("%+q → (%+q, %v), expected (%+q, %v)", + test.domain, output, err, test.expected, nil) + } + } + + // Tests expecting an error. + for _, domain := range []string{ + "xn---", + } { + output, err := domainPrefixBasic(domain) + if err == nil || output != "" { + t.Errorf("%+q → (%+q, %v), expected (%+q, non-nil)", + domain, output, err, "") + } + } +} + +func TestDomainPrefixFallback(t *testing.T) { + for _, test := range []struct { + domain, expected string + }{ + { + "", + "4oymiquy7qobjgx36tejs35zeqt24qpemsnzgtfeswmrw6csxbkq", + }, + { + "example.com", + "un42n5xov642kxrxrqiyanhcoupgql5lt4wtbkyt2ijflbwodfdq", + }, + + // These checked against the output of + // https://github.com/ampproject/amp-toolbox/tree/84cb3057e5f6c54d64369ddd285db..., + // using the widget at + // https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors.... 
+ { + "000000000000000000000000000000000000000000000000000000000000.com", + "stejanx4hsijaoj4secyecy4nvqodk56kw72whwcmvdbtucibf5a", + }, + { + "00000000000000000000000000000000000000000000000000000000000a.com", + "jdcvbsorpnc3hcjrhst56nfm6ymdpovlawdbm2efyxpvlt4cpbya", + }, + { + "00000000000000000000000000000000000000000000000000000000000\u03bb.com", + "qhzqeumjkfpcpuic3vqruyjswcr7y7gcm3crqyhhywvn3xrhchfa", + }, + } { + output := domainPrefixFallback(test.domain) + if output != test.expected { + t.Errorf("%+q → %+q, expected %+q", + test.domain, output, test.expected) + } + } +} + +// Checks that domainPrefix chooses domainPrefixBasic or domainPrefixFallback as +// appropriate; i.e., always returns string that is a valid DNS label and is +// IDNA-decodable. +func TestDomainPrefix(t *testing.T) { + // A validating IDNA profile, which checks label length and that the + // label contains only certain ASCII characters. It does not do the + // ValidateLabels check, because that depends on the input having + // certain properties. + profile := idna.New( + idna.VerifyDNSLength(true), + idna.StrictDomainName(true), + ) + for _, domain := range []string{ + "example.com", + "\u0314example.com", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", // 63 bytes + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", // 64 bytes + "xn--57hw060o.com", + "a b c", + } { + output := domainPrefix(domain) + if bytes.IndexByte([]byte(output), '.') != -1 { + t.Errorf("%+q → %+q contains a dot", domain, output) + } + _, err := profile.ToUnicode(output) + if err != nil { + t.Errorf("%+q → error %v", domain, err) + } + } +} + +func mustParseURL(rawurl string) *url.URL { + u, err := url.Parse(rawurl) + if err != nil { + panic(err) + } + return u +} + +func TestCacheURL(t *testing.T) { + // Tests expecting no error. 
+ for _, test := range []struct { + pub string + cache string + contentType string + expected string + }{ + // With or without trailing slash on pubURL. + { + "http://example.com/", + "https://amp.cache/", + "c", + "https://example-com.amp.cache/c/example.com", + }, + { + "http://example.com", + "https://amp.cache/", + "c", + "https://example-com.amp.cache/c/example.com", + }, + // https pubURL. + { + "https://example.com/", + "https://amp.cache/", + "c", + "https://example-com.amp.cache/c/s/example.com", + }, + // The content type should be escaped if necessary. + { + "http://example.com/", + "https://amp.cache/", + "/", + "https://example-com.amp.cache/%2F/example.com", + }, + // Retain pubURL path, query, and fragment, including escaping. + { + "http://example.com/my%2Fpath/index.html?a=1#fragment", + "https://amp.cache/", + "c", + "https://example-com.amp.cache/c/example.com/my%2Fpath/index.html?a=1#fragmen...", + }, + // Retain scheme, userinfo, port, and path of cacheURL, escaping + // whatever is necessary. + { + "http://example.com", + "http://cache%2Fuser:cache%40pass@amp.cache:123/with/../../path/..%2f../", + "c", + "http://cache%2Fuser:cache%40pass@example-com.amp.cache:123/path/..%2f../c/ex...", + }, + // Port numbers in pubURL are allowed, if they're the default + // for scheme. + { + "http://example.com:80/", + "https://amp.cache/", + "c", + "https://example-com.amp.cache/c/example.com", + }, + { + "https://example.com:443/", + "https://amp.cache/", + "c", + "https://example-com.amp.cache/c/s/example.com", + }, + // "?" at the end of cacheURL is okay, as long as the query is + // empty. + { + "http://example.com/", + "https://amp.cache/?", + "c", + "https://example-com.amp.cache/c/example.com", + }, + + // https://developers.google.com/amp/cache/overview#example-requesting-document... 
+ { + "https://example.com/amp_document.html", + "https://cdn.ampproject.org/", + "c", + "https://example-com.cdn.ampproject.org/c/s/example.com/amp_document.html", + }, + // https://developers.google.com/amp/cache/overview#example-requesting-image-us... + { + "http://example.com/logo.png", + "https://cdn.ampproject.org/", + "i", + "https://example-com.cdn.ampproject.org/i/example.com/logo.png", + }, + // https://developers.google.com/amp/cache/overview#query-parameter-example + { + "https://example.com/g?value=Hello%20World", + "https://cdn.ampproject.org/", + "c", + "https://example-com.cdn.ampproject.org/c/s/example.com/g?value=Hello%20World", + }, + } { + pubURL := mustParseURL(test.pub) + cacheURL := mustParseURL(test.cache) + outputURL, err := CacheURL(pubURL, cacheURL, test.contentType) + if err != nil { + t.Errorf("%+q %+q %+q → error %v", + test.pub, test.cache, test.contentType, err) + continue + } + if outputURL.String() != test.expected { + t.Errorf("%+q %+q %+q → %+q, expected %+q", + test.pub, test.cache, test.contentType, outputURL, test.expected) + continue + } + } + + // Tests expecting an error. + for _, test := range []struct { + pub string + cache string + contentType string + }{ + // Empty content type. + { + "http://example.com/", + "https://amp.cache/", + "", + }, + // Empty host. + { + "http:///index.html", + "https://amp.cache/", + "c", + }, + // Empty scheme. + { + "//example.com/", + "https://amp.cache/", + "c", + }, + // Unrecognized scheme. + { + "ftp://example.com/", + "https://amp.cache/", + "c", + }, + // Wrong port number for scheme. + { + "http://example.com:443/", + "https://amp.cache/", + "c", + }, + // userinfo in pubURL. + { + "http://user@example.com/", + "https://amp.cache/", + "c", + }, + { + "http://user:pass@example.com/", + "https://amp.cache/", + "c", + }, + // cacheURL may not contain a query. + { + "http://example.com/", + "https://amp.cache/?a=1", + "c", + }, + // cacheURL may not contain a fragment. 
+ { + "http://example.com/", + "https://amp.cache/#fragment", + "c", + }, + } { + pubURL := mustParseURL(test.pub) + cacheURL := mustParseURL(test.cache) + outputURL, err := CacheURL(pubURL, cacheURL, test.contentType) + if err == nil { + t.Errorf("%+q %+q %+q → %+q, expected error", + test.pub, test.cache, test.contentType, outputURL) + continue + } + } +} diff --git a/common/amp/doc.go b/common/amp/doc.go new file mode 100644 index 0000000..1387114 --- /dev/null +++ b/common/amp/doc.go @@ -0,0 +1,88 @@ +/* +Package amp provides functions for working with the AMP (Accelerated Mobile +Pages) subset of HTML, and conveying binary data through an AMP cache. + +AMP cache + +The CacheURL function takes a plain URL and converts it to be accessed through a +given AMP cache. + +The EncodePath and DecodePath functions provide a way to encode data into the +suffix of a URL path. AMP caches do not support HTTP POST, but encoding data +into a URL path with GET is an alternative means of sending data to the server. +The format of an encoded path is: + 0<0 or more bytes, including slash>/<base64 of data> +That is: +* "0", a format version number, which controls the interpretation of the rest of +the path. Only the first byte matters as a version indicator (not the whole +first path component). +* Any number of slash or non-slash bytes. These may be used as padding or to +prevent cache collisions in the AMP cache. +* A final slash. +* base64 encoding of the data, using the URL-safe alphabet (which does not +include slash). + +For example, an encoding of the string "This is path-encoded data." is the +following. The "lgWHcwhXFjUm" following the format version number is random +padding that will be ignored on decoding. + 0lgWHcwhXFjUm/VGhpcyBpcyBwYXRoLWVuY29kZWQgZGF0YS4 + +It is the caller's responsibility to add or remove any directory path prefix +before calling EncodePath or DecodePath. 
+ +AMP armor + +AMP armor is a data encoding scheme that satisfies the requirements of the +AMP (Accelerated Mobile Pages) subset of HTML, and survives modification by an +AMP cache. For the requirements of AMP HTML, see +https://amp.dev/documentation/guides-and-tutorials/learn/spec/amphtml/. +For modifications that may be made by an AMP cache, see +https://github.com/ampproject/amphtml/blob/main/docs/spec/amp-cache-modifica.... + +The encoding is based on ones created by Ivan Markin. See codec/amp/ in +https://github.com/nogoegst/amper and discussion at +https://bugs.torproject.org/tpo/anti-censorship/pluggable-transports/snowfla.... + +The encoding algorithm works as follows. Base64-encode the input. Prepend the +result with the byte '0'; this is a protocol version indicator that the decoder +can use to determine how to interpret the bytes that follow. Split the base64 +into fixed-size chunks separated by whitespace. Take up to 992 chunks at a +time, and wrap them in a pre element. Then, situate the markup so far within the +body of the AMP HTML boilerplate. The decoding algorithm is to scan the HTML for +pre elements, split their text contents on whitespace and concatenate, then +base64 decode. The base64 encoding uses the standard alphabet, with normal "=" +padding (https://tools.ietf.org/html/rfc4648#section-4). + +The reason for splitting the base64 into chunks is that AMP caches reportedly +truncate long strings that are not broken by whitespace: +https://bugs.torproject.org/tpo/anti-censorship/pluggable-transports/snowfla.... +The characters that may separate the chunks are the ASCII whitespace characters +(https://infra.spec.whatwg.org/#ascii-whitespace) "\x09", "\x0a", "\x0c", +"\x0d", and "\x20". The reason for separating the chunks into pre elements is to +limit the amount of text a decoder may have to buffer while parsing the HTML. +Each pre element may contain at most 32 KB of text. pre elements may not be +nested. 
+ +Example + +The following is the result of encoding the string +"This was encoded with AMP armor.": + + <!doctype html> + <html amp> + <head> + <meta charset="utf-8"> + <script async src="https://cdn.ampproject.org/v0.js"></script> + <link rel="canonical" href="#"> + <meta name="viewport" content="width=device-width"> + <style amp-boilerplate>body{-webkit-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-moz-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-ms-animation:-amp-start 8s steps(1,end) 0s 1 normal both;animation:-amp-start 8s steps(1,end) 0s 1 normal both}@-webkit-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-moz-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-ms-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-o-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}</style><noscript><style amp-boilerplate>body{-webkit-animation:none;-moz-animation:none;-ms-animation:none;animation:none}</style></noscript> + </head> + <body> + <pre> + 0VGhpcyB3YXMgZW5jb2RlZCB3aXRoIEF + NUCBhcm1vci4= + </pre> + </body> + </html> +*/ +package amp diff --git a/common/amp/path.go b/common/amp/path.go new file mode 100644 index 0000000..5903694 --- /dev/null +++ b/common/amp/path.go @@ -0,0 +1,44 @@ +package amp + +import ( + "crypto/rand" + "encoding/base64" + "fmt" + "strings" +) + +// EncodePath encodes data in a way that is suitable for the suffix of an AMP +// cache URL. +func EncodePath(data []byte) string { + var cacheBreaker [9]byte + _, err := rand.Read(cacheBreaker[:]) + if err != nil { + panic(err) + } + b64 := base64.RawURLEncoding.EncodeToString + return "0" + b64(cacheBreaker[:]) + "/" + b64(data) +} + +// DecodePath decodes data from a path suffix as encoded by EncodePath. The path +// must have already been trimmed of any directory prefix (as might be present +// in, e.g., an HTTP request). 
That is, the first character of path should be +// the "0" message format indicator. +func DecodePath(path string) ([]byte, error) { + if len(path) < 1 { + return nil, fmt.Errorf("missing format indicator") + } + version := path[0] + rest := path[1:] + switch version { + case '0': + // Ignore everything else up to and including the final slash + // (there must be at least one slash). + i := strings.LastIndexByte(rest, '/') + if i == -1 { + return nil, fmt.Errorf("missing data") + } + return base64.RawURLEncoding.DecodeString(rest[i+1:]) + default: + return nil, fmt.Errorf("unknown format indicator %q", version) + } +} diff --git a/common/amp/path_test.go b/common/amp/path_test.go new file mode 100644 index 0000000..20e4ccf --- /dev/null +++ b/common/amp/path_test.go @@ -0,0 +1,54 @@ +package amp + +import ( + "testing" +) + +func TestDecodePath(t *testing.T) { + for _, test := range []struct { + path string + expectedData string + expectedErrStr string + }{ + {"", "", "missing format indicator"}, + {"0", "", "missing data"}, + {"0foobar", "", "missing data"}, + {"/0/YWJj", "", "unknown format indicator '/'"}, + + {"0/", "", ""}, + {"0foobar/", "", ""}, + {"0/YWJj", "abc", ""}, + {"0///YWJj", "abc", ""}, + {"0foobar/YWJj", "abc", ""}, + {"0/foobar/YWJj", "abc", ""}, + } { + data, err := DecodePath(test.path) + if test.expectedErrStr != "" { + if err == nil || err.Error() != test.expectedErrStr { + t.Errorf("%+q expected error %+q, got %+q", + test.path, test.expectedErrStr, err) + } + } else if err != nil { + t.Errorf("%+q expected no error, got %+q", test.path, err) + } else if string(data) != test.expectedData { + t.Errorf("%+q expected data %+q, got %+q", + test.path, test.expectedData, data) + } + } +} + +func TestPathRoundTrip(t *testing.T) { + for _, data := range []string{ + "", + "\x00", + "/", + "hello world", + } { + decoded, err := DecodePath(EncodePath([]byte(data))) + if err != nil { + t.Errorf("%+q roundtripped with error %v", data, err) + } else if 
string(decoded) != data { + t.Errorf("%+q roundtripped to %+q", data, decoded) + } + } +}
tor-commits@lists.torproject.org