unofficial mirror of bug-gnu-emacs@gnu.org 
 help / color / mirror / code / Atom feed
From: "Sebastián Monía" <sebastian@sebasmonia.com>
To: Jim Porter <jporterbugs@gmail.com>
Cc: Eli Zaretskii <eliz@gnu.org>, 73133@debbugs.gnu.org, ganimard@tuta.io
Subject: bug#73133: 29.2; EWW fails to render some webpages
Date: Tue, 08 Oct 2024 23:30:03 -0400	[thread overview]
Message-ID: <87y12y7y2s.fsf@sebasmonia.com> (raw)
In-Reply-To: <5e49a521-a191-15db-6368-6ca0f046d68a@gmail.com> (Jim Porter's message of "Thu, 3 Oct 2024 16:39:06 -0700")

[-- Attachment #1: Type: text/plain, Size: 1076 bytes --]


Jim Porter <jporterbugs@gmail.com> writes:
> On 9/30/2024 10:10 AM, Sebastián Monía wrote:
>> We aren't really guessing the content-type, at least in the scope of my
>> original patch, and probably this bug. We just want to know if the page
>> is HTML to render it, in these snippets (part of eww-render):
>
> What I was thinking about was something like this (with some
> appropriate implementation for 'eww--guess-content-type', possibly
> accepting args as needed):
>
> diff --git a/lisp/net/eww.el b/lisp/net/eww.el
> index b5d2f20781a..1c134717cc9 100644
> --- a/lisp/net/eww.el
> +++ b/lisp/net/eww.el
> @@ -659,7 +659,7 @@ eww-render
>  	 (content-type
>  	  (mail-header-parse-content-type
>             (if (zerop (length (cdr (assoc "content-type" headers))))
> -	       "text/plain"
> +	       (eww--guess-content-type)
>               (cdr (assoc "content-type" headers)))))
>  	 (charset (intern
>  		   (downcase
Hello!

Attached is a new patch that goes in the direction outlined above; let
me know what you think.

Cheers,
Seb


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: patch --]
[-- Type: text/x-patch, Size: 3056 bytes --]

From 309a7d729665f14964a550f57f589a79705e23d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebasti=C3=A1n=20Mon=C3=ADa?= <sebastian@sebasmonia.com>
Date: Tue, 8 Oct 2024 23:26:42 -0400
Subject: [PATCH] Add customization to let EWW guess content-type if needed
 (bug#73133)

---
 lisp/net/eww.el | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/lisp/net/eww.el b/lisp/net/eww.el
index b5d2f20781a..0a9a621f3e5 100644
--- a/lisp/net/eww.el
+++ b/lisp/net/eww.el
@@ -108,6 +108,19 @@ eww-suggest-uris
              eww-current-url
              eww-bookmark-urls))
 
+(defcustom eww-guess-content-type-functions
+  '(eww--html-if-doctype)
+  "List of functions used to guess a page's content-type.
+These are only used when the page does not have a valid Content-Type
+header.  Functions are called in order, until one of them returns the
+value to be used as Content-Type.  They receive two parameters: an alist
+of headers, and the buffer that holds the complete response.  If the
+list is exhausted, eww assumes \"text/plain\" so the user can see the
+markup."
+  :version "31.1"
+  :group 'eww
+  :type '(repeat function))
+
 (defcustom eww-bookmarks-directory user-emacs-directory
   "Directory where bookmark files will be stored."
   :version "25.1"
@@ -630,6 +643,31 @@ eww-html-p
   (member content-type '("text/html"
 			 "application/xhtml+xml")))
 
+(defun eww--guess-content-type (headers response-buffer)
+  "Use HEADERS and RESPONSE-BUFFER to guess the Content-Type.
+Will call each function in `eww-guess-content-type-functions', until one
+of them returns a value.  This mechanism is used only if there isn't a
+valid Content-Type header.  If none of the functions can guess, return
+\"text/plain\", so at least the markup is displayed."
+  (let ((first-guess (seq-some
+                      (lambda (f) (funcall f headers response-buffer))
+                      eww-guess-content-type-functions)))
+    (or first-guess "text/plain")))
+
+(defun eww--html-if-doctype (headers response-buffer)
+  "Return \"text/html\" if RESPONSE-BUFFER has an HTML doctype declaration.
+HEADERS is unused."
+  ;; https://html.spec.whatwg.org/multipage/syntax.html#the-doctype
+  (let ((case-fold-search t)
+        (target
+         "<!doctype +html *\\(>\\|system +\\(\\\"\\|'\\)+about:legacy-compat\\)"))
+    (with-current-buffer response-buffer
+      (goto-char (point-min))
+      ;; Match the basic <!doctype html> and also the legacy variants
+      ;; specified in the link above.
+      (when (re-search-forward target nil t)
+        "text/html"))))
+
 (defun eww--rename-buffer ()
   "Rename the current EWW buffer.
 The renaming scheme is performed in accordance with
@@ -659,7 +697,7 @@ eww-render
 	 (content-type
 	  (mail-header-parse-content-type
            (if (zerop (length (cdr (assoc "content-type" headers))))
-	       "text/plain"
+               (eww--guess-content-type headers buffer)
              (cdr (assoc "content-type" headers)))))
 	 (charset (intern
 		   (downcase
-- 
2.43.0


  reply	other threads:[~2024-10-09  3:30 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-09-08 20:52 bug#73133: 29.2; EWW fails to render some webpages Ganimard via Bug reports for GNU Emacs, the Swiss army knife of text editors
2024-09-10  6:06 ` Jim Porter
2024-09-21  9:13   ` Eli Zaretskii
2024-09-21 17:12     ` Jim Porter
2024-09-23 15:43       ` Sebastián Monía
2024-09-28 10:58         ` Eli Zaretskii
2024-09-30 15:52           ` Sebastián Monía
2024-09-23 15:56       ` Sebastián Monía
2024-09-24 18:31         ` Jim Porter
2024-09-25 20:46           ` Sebastián Monía
2024-09-26  1:59             ` Jim Porter
2024-09-30 17:10               ` Sebastián Monía
2024-10-03 23:39                 ` Jim Porter
2024-10-09  3:30                   ` Sebastián Monía [this message]
2024-10-09  3:42                     ` Jim Porter
2024-10-10  2:08                       ` Sebastián Monía
2024-10-14  4:35                         ` Jim Porter
2024-10-14 14:03                           ` Eli Zaretskii
2024-10-15 11:43                             ` Sebastián Monía

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/emacs/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87y12y7y2s.fsf@sebasmonia.com \
    --to=sebastian@sebasmonia.com \
    --cc=73133@debbugs.gnu.org \
    --cc=eliz@gnu.org \
    --cc=ganimard@tuta.io \
    --cc=jporterbugs@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).