diff --git a/lisp/calendar/parse-date.el b/lisp/calendar/parse-date.el new file mode 100644 index 0000000000..c4b756cf2e --- /dev/null +++ b/lisp/calendar/parse-date.el @@ -0,0 +1,281 @@ +;;; parse-date.el --- parsing time/date strings -*- lexical-binding: t -*- + +;; Copyright (C) 2021 Free Software Foundation, Inc. + +;; Author: Bob Rogers +;; Keywords: util + +;; This file is part of GNU Emacs. + +;; GNU Emacs is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. + +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GNU Emacs. If not, see . + +;;; Commentary: + +;; 'parse-date' parses a time and/or date in a string and returns a +;; list of values, just like `decode-time', where unspecified elements +;; in the string are returned as nil (except unspecified DST is +;; returned as -1). `encode-time' may be applied on these values to +;; obtain an internal time value. If left to its own devices, it +;; accepts a wide variety of formats, but can be told to insist on a +;; particular date/time format. + +;; Historically, `parse-time-string' was used for this purpose, but it +;; was focused on email date formats, and gradually but imperfectly +;; extended to handle other formats. 'parse-date' is compatible in +;; that it parses the same input formats and uses the same return +;; value format, but is stricter in that it signals an error for +;; tokens that `parse-time-string' would simply ignore. + +;;; TODO: +;; +;; * Define and signal a date-error for parsing issues. +;; +;; * Implement rfc2822 and rfc822 independently of parse-time-string. +;; +;; * Add a euro-date format for DD/MM/YYYY ? +;; + +;;; Code: + +(require 'cl-lib) +(require 'iso8601) +(require 'parse-time) + +(defun parse-date-guess-format (time-string) + (cond ((string-match "[0-9]T[0-9]" time-string) 'iso8601) + (t nil))) + +(defun parse-date-ignore-char? (char) + (or (eq char ?\ ) (eq char ?,) (eq char ?,))) + +(defun parse-date-tokenize-string (string) + "Turn STRING into tokens, separated only by whitespace and commas. +Multiple commas are ignored. Pure digit sequences are turned +into integers." + (let ((index 0) + (end (length string)) + (char nil) + (list ())) + ;; Skip leading ignored characters. + (while (and (< index end) + (setq char (aref string index)) + (parse-date-ignore-char? char)) + (cl-incf index)) + (while (< index end) + (let ((start index) + (all-digits (<= ?0 char ?9))) + ;; char is valid; look for more valid characters. + (while (and (< (cl-incf index) end) + (setq char (aref string index)) + (not (parse-date-ignore-char? char))) + (unless (<= ?0 char ?9) + (setq all-digits nil))) + (when (<= index end) + (push (if all-digits + (cl-parse-integer string :start start :end index) + (substring string start index)) + list) + ;; Skip ignored characters. + (while (and (< (cl-incf index) end) + (setq char (aref string index)) + (parse-date-ignore-char? char)) + ()) + ;; Next token. + ))) + (nreverse list))) + +(defconst parse-date-slot-names + '(second minute hour day month year weekday dst zone) + "Names of return value slots, for better error messages +See the decoded-time defstruct.") + +(defconst parse-date-slot-ranges + '((0 59) (0 59) (0 23) (1 31) (1 12) (1 9999)) + "Numeric slot ranges, for bounds checking.") + +(defun parse-date-default (time-string two-digit-year?) + ;; Do the standard parsing thing. This is mostly free form, in that + ;; tokens may appear in any order, but we expect to introduce some + ;; state dependence. + (let ((tokens (parse-date-tokenize-string (downcase time-string))) + (time (list nil nil nil nil nil nil nil -1 nil))) + (cl-flet ((set-matched-slot (slot index token) + ;; Assign a slot value from match data if index is + ;; non-nil, else from token, signalling an error if + ;; it's already been assigned or is out of range. + (let ((value (if index + (cl-parse-integer (match-string index token)) + token)) + (range (nth slot parse-date-slot-ranges))) + (unless (equal (nth slot time) + (if (= slot 7) -1 nil)) + (error "Duplicate %s slot value '%s'" + (nth slot parse-date-slot-names) token)) + (when (and range + (not (<= (car range) value (cadr range)))) + (error "Value %s is out of range for %s" + token (nth slot parse-date-slot-names))) + (setf (nth slot time) value)))) + (while tokens + (let ((token (pop tokens)) + (match nil)) + (cond ((numberp token) + ;; A bare number could be a month, day, or year. + ;; The order of these tests matters greatly. + (cond ((>= token 1000) + (set-matched-slot 5 nil token)) + ((and (<= 1 token 31) + (not (nth 3 time))) + ;; Assume days come before months or years. + (set-matched-slot 3 nil token)) + ((and (<= 1 token 12) + (not (nth 4 time))) + ;; Assume days come before years. + (set-matched-slot 4 nil token)) + ((or (nth 5 time) + (not two-digit-year?) + (> token 100)) + (error "Unrecognized numeric value %s" token)) + ;; It's a two-digit year. + ((>= token 50) + ;; second half of the 20th century. + (set-matched-slot 5 nil (+ 1900 token))) + (t + ;; first half of the 21st century. + (set-matched-slot 5 nil (+ 2000 token))))) + ((setq match (assoc token parse-time-weekdays)) + (set-matched-slot 6 nil (cdr match))) + ((setq match (assoc token parse-time-months)) + (set-matched-slot 4 nil (cdr match))) + ((setq match (assoc token parse-time-zoneinfo)) + (set-matched-slot 8 nil (cadr match)) + (set-matched-slot 7 nil (caddr match))) + ((string-match "^[-+][0-9][0-9][0-9][0-9]$" token) + ;; Numeric time zone. + (set-matched-slot + 8 nil + (* 60 + (+ (cl-parse-integer token :start 3 :end 5) + (* 60 (cl-parse-integer token :start 1 :end 3))) + (if (= (aref token 0) ?-) -1 1)))) + ((string-match + "^\\([0-9][0-9][0-9][0-9]\\)[-/]\\([0-9][0-9]?\\)[-/]\\([0-9][0-9]?\\)$" + token) + ;; ISO-8601-style date (YYYY-MM-DD). + (set-matched-slot 5 1 token) + (set-matched-slot 4 2 token) + (set-matched-slot 3 3 token)) + ((string-match + "^\\([0-9][0-9]?\\)[-/]\\([0-9][0-9]?\\)[-/]\\([0-9][0-9][0-9][0-9]\\)$" + token) + ;; US date (MM-DD-YYYY), but we insist on four + ;; digits for the year. + (set-matched-slot 4 1 token) + (set-matched-slot 3 2 token) + (set-matched-slot 5 3 token)) + ((string-match + "^\\([0-9][0-9]?\\):\\([0-9][0-9]\\):\\([0-9][0-9]\\)$" + token) + (set-matched-slot 2 1 token) + (set-matched-slot 1 2 token) + (set-matched-slot 0 3 token)) + ((string-match "^\\([0-9][0-9]?\\):\\([0-9][0-9]\\)$" token) + ;; Time without seconds. + (set-matched-slot 2 1 token) + (set-matched-slot 1 2 token) + (set-matched-slot 0 nil 0)) + ((member token '("am" "pm")) + (unless (nth 2 time) + (error "'AM'/'PM' specified before or without time")) + (unless (<= (nth 2 time) 12) + (error "'AM'/'PM' specified for time already past noon")) + (when (equal token "pm") + (cl-incf (nth 2 time) 12))) + (t + (error "Unrecognized time token '%s'" token)))))) + time)) + +;;;###autoload +(defun parse-date (time-string &optional format) + "Parse TIME-STRING according to FORMAT, returning a list. +The FORMAT value is a symbol that may be one of the following: + + iso8601 => parse the string according to the ISO-8601 +standard. See `parse-iso8601-time-string'. + + iso-8601 => synonym for iso8601. + + rfc822 => parse an RFC822 (old email) date, which allows +two-digit years and internal '()' comments. In dates of the form +'11 Jan 12', the 11 is assumed to be the day, and the 12 is +assumed to mean 2012. [not fully implemented.] + + rfc2822 => parse an RFC2822 (new email) date, which allows +only four-digit years. [not implemented.] + + us-date => parse a US-style date, of the form MM/DD/YYYY, but +allowing two-digit years. In dates of the form '01/11/12', the 1 +is the month, 11 is the day, and the 12 is assumed to mean 2012. +[not fully implemented.] + + nil => attempt to guess the format, falling back on us-date +with two-digit years disallowed. + +The default is nil, and anything else is assumed to be us-date +with two-digit years disallowed. + + * For all formats except iso8601, parsing is case-insensitive. + + * Commas and whitespace are ignored. + + * In date specifications, either '/' or '-' may be used to +separate components, but all three components must be given. + + * A date that starts with four digits is YYYY-MM-DD, ISO-8601 +style, but a date that ends with four digits is MM-DD-YYYY [at +least in us-date format]. + + * Two digit years, when allowed, are in the 1900's when +between 50 and 99 and in the 2000's when between 0 and 49. + +Errors are signalled when time values are duplicated, +unrecognized, or out of range. No consistency checks between +fields are done. For instance, the weekday is not checked to see +that it corresponds to the date, and parse-date complains about +the 32nd of March (or any other month) but blithely accepts the +29th of February in non-leap years -- or the 31st of February in +any year. + +The result is a list of (SEC MIN HOUR DAY MON YEAR DOW DST TZ), +which can be accessed as a decoded-time defstruct (q.v.), +e.g. `decoded-time-year' to extract the year, and turned into an +Emacs timestamp by `encode-time'. The values returned are +identical to those of `decode-time', but any unknown values other +than DST are returned as nil, and an unknown DST value is +returned as -1." + (cl-case (or format (parse-date-guess-format time-string)) + ((iso8601 iso-8601) + (parse-iso8601-time-string time-string)) + ((rfc822 rfc2822) + ;; [Placeholder; we eventually want something more strict. -- + ;; rgr, 20-Dec-21.] + (parse-time-string time-string)) + (us-date + (parse-date-default time-string t)) + (t + (parse-date-default time-string nil)))) + +(provide 'parse-date) + +;;; parse-date.el ends here diff --git a/test/lisp/calendar/parse-date-tests.el b/test/lisp/calendar/parse-date-tests.el new file mode 100644 index 0000000000..682365e674 --- /dev/null +++ b/test/lisp/calendar/parse-date-tests.el @@ -0,0 +1,164 @@ +;;; parse-date-tests.el --- Test suite for parse-date.el -*- lexical-binding:t -*- + +;; Copyright (C) 2016-2021 Free Software Foundation, Inc. + +;; Author: Lars Ingebrigtsen + +;; This file is part of GNU Emacs. + +;; GNU Emacs is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. + +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GNU Emacs. If not, see . + +;;; Commentary: + +;;; Code: + +(require 'ert) +(require 'parse-date) + +(ert-deftest parse-date-tests () + "Test basic parse-date functionality." + + ;; Test tokenization. + (should (equal (parse-date-tokenize-string " ") '())) + (should (equal (parse-date-tokenize-string " a b") '("a" "b"))) + (should (equal (parse-date-tokenize-string "a bbc dde") '("a" "bbc" "dde"))) + (should (equal (parse-date-tokenize-string " , a 27 b,, c 14:32 ") + '("a" 27 "b" "c" "14:32"))) + + ;; Start with some RFC822 dates. + (dolist (format '(nil rfc822)) + (should (equal (parse-date "Mon, 22 Feb 2016 19:35:42 +0100" format) + '(42 35 19 22 2 2016 1 -1 3600))) + (should (equal (parse-date "22 Feb 2016 19:35:42 +0100" format) + '(42 35 19 22 2 2016 nil -1 3600))) + (should (equal (parse-date "22 Feb 2016 +0100" format) + '(nil nil nil 22 2 2016 nil -1 3600))) + (should (equal (parse-date "Mon, 22 February 2016 19:35:42 +0100" format) + '(42 35 19 22 2 2016 1 -1 3600))) + (should (equal (parse-date "Mon, 22 feb 2016 19:35:42 +0100" format) + '(42 35 19 22 2 2016 1 -1 3600))) + (should (equal (parse-date "Monday, 22 february 2016 19:35:42 +0100" format) + '(42 35 19 22 2 2016 1 -1 3600))) + (should (equal (parse-date "Monday, 22 february 2016 19:35:42 PST" format) + '(42 35 19 22 2 2016 1 nil -28800))) + (should (equal (parse-date "Friday, 21 Sep 2018 13:47:58 PDT" format) + '(58 47 13 21 9 2018 5 t -25200))) + (should (equal (parse-date "Friday, 21 Sep 2018 13:47:58" format) + '(58 47 13 21 9 2018 5 -1 nil))) + (should (equal (parse-date "Friday, 21 Sep 2018" format) + '(nil nil nil 21 9 2018 5 -1 nil)))) + ;; These are not allowed by the default format. + (should (equal (parse-date "22 Feb 16 19:35:42 +0100" 'rfc822) + '(42 35 19 22 2 2016 nil -1 3600))) + (should (equal (parse-date "22 Feb 96 19:35:42 +0100" 'rfc822) + '(42 35 19 22 2 1996 nil -1 3600))) + + ;; Test the default format with both hyphens and slashes in dates. + (dolist (case '(;; Month can be numeric if date uses hyphens/slashes. + ("Friday, 2018-09-21" (nil nil nil 21 9 2018 5 -1 nil)) + ;; Year can come last if four digits. + ("Friday, 9-21-2018" (nil nil nil 21 9 2018 5 -1 nil)) + ;; Day of week is optional + ("2018-09-21" (nil nil nil 21 9 2018 nil -1 nil)) + ;; The order of date, time, etc., does not matter. + ("13:47:58, +0100, 2018-09-21, Friday" + (58 47 13 21 9 2018 5 -1 3600)) + ;; Month, day, or both, can be a single digit. + ("Friday, 2018-9-08" (nil nil nil 8 9 2018 5 -1 nil)) + ("Friday, 2018-09-8" (nil nil nil 8 9 2018 5 -1 nil)) + ("Friday, 2018-9-8" (nil nil nil 8 9 2018 5 -1 nil)))) + (let ((string (car case)) + (expected (cadr case))) + ;; Test with hyphens. + (should (equal (parse-date string) expected)) + (while (string-match "-" string) + (setq string (replace-match "/" t t string))) + ;; Test with slashes. + (should (equal (parse-date string) expected)))) + + ;; Time by itself is recognized as such. + (should (equal (parse-date "03:47:58") + '(58 47 3 nil nil nil nil -1 nil))) + ;; A leading zero for hours is optional. + (should (equal (parse-date "3:47:58") + '(58 47 3 nil nil nil nil -1 nil))) + ;; Missing seconds are assumed to be zero. + (should (equal (parse-date "3:47") + '(0 47 3 nil nil nil nil -1 nil))) + ;; AM/PM are understood (in any case combination). + (dolist (am '(am AM Am)) + (should (equal (parse-date (format "3:47 %s" am)) + '(0 47 3 nil nil nil nil -1 nil)))) + (dolist (pm '(pm PM Pm)) + (should (equal (parse-date (format "3:47 %s" pm)) + '(0 47 15 nil nil nil nil -1 nil)))) + + ;; Ensure some cases fail. + (should-error (parse-date "22 Feb 196" 'us-date)) ;; bad year + (should-error (parse-date "22 Feb 16 19:35:42")) ;; two-digit year + (should-error (parse-date "22 Feb 96 19:35:42")) ;; two-digit year + (should-error (parse-date "2 Feb 2021 1996")) ;; duplicate year + (should-error (parse-date "2020-1-1 2021")) ;; another duplicate year + (should-error (parse-date "2020-1-1 30")) ;; extra 30 (not a day)) + (should-error (parse-date "2020-1-1 12")) ;; extra 12 (not a month) + (should-error (parse-date "15:47 15:15")) ;; duplicate time + (should-error (parse-date "2020-1-1 +0800 -0800")) ;; duplicate TZ + (should-error (parse-date "15:47 PM")) ;; PM in the afternoon + (should-error (parse-date "2020-1-1 PM")) ;; PM without a time + ;; Range tests. + (should-error (parse-date "2021-12-32")) + (should-error (parse-date "2021-12-0")) + (should-error (parse-date "2021-13-3")) + (should-error (parse-date "0000-12-3")) + (should-error (parse-date "20021 Dec 3")) + (should-error (parse-date "24:21:14")) + (should-error (parse-date "14:60:21")) + (should-error (parse-date "14:21:60")) + + ;; Test ISO-8601 dates. + (dolist (format '(nil iso8601 iso-8601)) + (should (equal (format-time-string + "%Y-%m-%d %H:%M:%S" + (parse-date "1998-09-12T12:21:54-0200" format) t) + "1998-09-12 14:21:54")) + (should (equal (format-time-string + "%Y-%m-%d %H:%M:%S" + (parse-date "1998-09-12T12:21:54-0230" format) t) + "1998-09-12 14:51:54")) + (should (equal (format-time-string + "%Y-%m-%d %H:%M:%S" + (parse-date "1998-09-12T12:21:54-02:00" format) t) + "1998-09-12 14:21:54")) + (should (equal (format-time-string + "%Y-%m-%d %H:%M:%S" + (parse-date "1998-09-12T12:21:54-02" format) t) + "1998-09-12 14:21:54")) + (should (equal (format-time-string + "%Y-%m-%d %H:%M:%S" + (parse-date "1998-09-12T12:21:54+0230" format) t) + "1998-09-12 09:51:54")) + (should (equal (format-time-string + "%Y-%m-%d %H:%M:%S" + (parse-date "1998-09-12T12:21:54+02" format) t) + "1998-09-12 10:21:54")) + (should (equal (format-time-string + "%Y-%m-%d %H:%M:%S" + (parse-date "1998-09-12T12:21:54Z" format) t) + "1998-09-12 12:21:54")) + (should (equal (parse-date "1998-09-12T12:21:54") + (encode-time 54 21 12 12 9 1998))))) + +(provide 'parse-date-tests) + +;;; parse-date-tests.el ends here