Initial commit
This commit is contained in:
513
elpa/haskell-mode-20171022.26/haskell-lexeme.el
Normal file
513
elpa/haskell-mode-20171022.26/haskell-lexeme.el
Normal file
@@ -0,0 +1,513 @@
|
||||
;;; haskell-lexeme.el --- haskell lexical tokens -*- coding: utf-8; lexical-binding: t -*-
|
||||
|
||||
;; Copyright (C) 2015 Gracjan Polak
|
||||
|
||||
;; This file is free software; you can redistribute it and/or modify
|
||||
;; it under the terms of the GNU General Public License as published by
|
||||
;; the Free Software Foundation; either version 3, or (at your option)
|
||||
;; any later version.
|
||||
|
||||
;; This file is distributed in the hope that it will be useful,
|
||||
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;; GNU General Public License for more details.
|
||||
|
||||
;; You should have received a copy of the GNU General Public License
|
||||
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
;;; Commentary:
|
||||
|
||||
;;; Code:
|
||||
|
||||
(require 'rx)
|
||||
|
||||
(unless (category-docstring ?P)
|
||||
(define-category ?P "Haskell symbol constituent characters")
|
||||
(map-char-table
|
||||
#'(lambda (key val)
|
||||
(if (or
|
||||
(and (consp key) (> (car key) 128))
|
||||
(and (numberp key) (> key 128)))
|
||||
(if (member val '(Pc Pd Po Sm Sc Sk So))
|
||||
(modify-category-entry key ?P))))
|
||||
unicode-category-table)
|
||||
|
||||
(dolist (key (string-to-list "!#$%&*+./<=>?@^|~\\-:"))
|
||||
(modify-category-entry key ?P)))
|
||||
|
||||
(defconst haskell-lexeme-modid
|
||||
"[[:upper:]][[:alnum:]'_]*"
|
||||
"Regexp matching a valid Haskell module identifier.
|
||||
|
||||
Note that GHC accepts Unicode category UppercaseLetter as a first
|
||||
character. Following letters are from Unicode categories
|
||||
UppercaseLetter, LowercaseLetter, OtherLetter, TitlecaseLetter,
|
||||
ModifierLetter, DecimalNumber, OtherNumber, backslash or
|
||||
underscore.")
|
||||
|
||||
(defconst haskell-lexeme-id
|
||||
"[[:alpha:]_][[:alnum:]'_]*"
|
||||
"Regexp matching a valid Haskell identifier.
|
||||
|
||||
GHC accepts a string starting with any alphabetic character or
|
||||
underscore followed by any alphanumeric character or underscore
|
||||
or apostrophe.")
|
||||
|
||||
(defconst haskell-lexeme-sym
|
||||
"\\cP+"
|
||||
"Regexp matching a valid Haskell variable or constructor symbol.
|
||||
|
||||
GHC accepts a string of chars from the set
|
||||
[:!#$%&*+./<=>?@^|~\\-] or Unicode category Symbol for chars with
|
||||
codes larger than 128 only.")
|
||||
|
||||
(defconst haskell-lexeme-idsym-first-char
|
||||
"\\(?:[[:alpha:]_]\\|\\cP\\)"
|
||||
"Regexp matching first character of a qualified or unqualified
|
||||
identifier or symbol.
|
||||
|
||||
Useful for `re-search-forward'.")
|
||||
|
||||
(defconst haskell-lexeme-modid-opt-prefix
|
||||
(concat "\\(?:" haskell-lexeme-modid "\\.\\)*")
|
||||
"Regexp matching a valid Haskell module prefix, potentially empty.
|
||||
|
||||
Module path prefix is separated by dots and finishes with a
|
||||
dot. For path component syntax see `haskell-lexeme-modid'.")
|
||||
|
||||
(defconst haskell-lexeme-qid-or-qsym
|
||||
(rx-to-string `(: (regexp ,haskell-lexeme-modid-opt-prefix)
|
||||
(group (| (regexp ,haskell-lexeme-id) (regexp ,haskell-lexeme-sym)
|
||||
))))
|
||||
"Regexp matching a valid qualified identifier or symbol.
|
||||
|
||||
Note that (match-string 1) returns the unqualified part.")
|
||||
|
||||
(defun haskell-lexeme-looking-at-qidsym ()
|
||||
"Non-nil when point is just in front of an optionally qualified
|
||||
identifier or symbol.
|
||||
|
||||
Using this function is more efficient than matching against the
|
||||
regexp `haskell-lexeme-qid-or-qsym'.
|
||||
|
||||
Returns:
|
||||
'qid - if matched a qualified id: 'Data.Map' or 'Map'
|
||||
'qsym - if matched a qualified id: 'Monad.>>=' or '>>='
|
||||
'qprefix - if matched only modid prefix: 'Data.'
|
||||
|
||||
After successful 'qid or 'qsym match (match-string 1) will return
|
||||
the unqualified part (if any)."
|
||||
(let ((begin (point))
|
||||
(match-data-old (match-data)))
|
||||
(save-excursion
|
||||
(while (looking-at (concat haskell-lexeme-modid "\\."))
|
||||
(goto-char (match-end 0)))
|
||||
(cond
|
||||
((looking-at haskell-lexeme-id)
|
||||
(let ((beg (match-beginning 0))
|
||||
(end (match-end 0)))
|
||||
|
||||
;; check is MagicHash is present at the end of the token
|
||||
(goto-char end)
|
||||
(when (looking-at "#+")
|
||||
(setq end (match-end 0)))
|
||||
|
||||
(set-match-data
|
||||
(list begin end
|
||||
beg end)))
|
||||
'qid)
|
||||
((looking-at haskell-lexeme-sym)
|
||||
(set-match-data
|
||||
(list begin (match-end 0)
|
||||
(match-beginning 0) (match-end 0)))
|
||||
'qsym)
|
||||
((equal begin (point))
|
||||
(set-match-data match-data-old)
|
||||
nil)
|
||||
(t
|
||||
(set-match-data
|
||||
(list begin (point)
|
||||
nil nil))
|
||||
'qprefix)))))
|
||||
|
||||
(defun haskell-lexeme-looking-at-backtick ()
|
||||
"Non-nil when point is just in front of an identifier quoted with backticks.
|
||||
|
||||
When match is successful, match-data will contain:
|
||||
(match-text 1) - opening backtick
|
||||
(match-text 2) - whole qualified identifier
|
||||
(match-text 3) - unqualified part of identifier
|
||||
(match-text 4) - closing backtick"
|
||||
(let ((match-data-old (match-data))
|
||||
first-backtick-start
|
||||
last-backtick-start
|
||||
qid-start
|
||||
id-start
|
||||
id-end
|
||||
result)
|
||||
(save-excursion
|
||||
(when (looking-at "`")
|
||||
(setq first-backtick-start (match-beginning 0))
|
||||
(goto-char (match-end 0))
|
||||
(forward-comment (buffer-size))
|
||||
(when (haskell-lexeme-looking-at-qidsym)
|
||||
(setq qid-start (match-beginning 0))
|
||||
(setq id-start (match-beginning 1))
|
||||
(setq id-end (match-end 1))
|
||||
(goto-char (match-end 0))
|
||||
(forward-comment (buffer-size))
|
||||
(when (looking-at "`")
|
||||
(setq last-backtick-start (match-beginning 0))
|
||||
(set-match-data
|
||||
(mapcar
|
||||
(lambda (p)
|
||||
(set-marker (make-marker) p))
|
||||
(list
|
||||
first-backtick-start (1+ last-backtick-start)
|
||||
first-backtick-start (1+ first-backtick-start)
|
||||
qid-start id-end
|
||||
id-start id-end
|
||||
last-backtick-start (1+ last-backtick-start))))
|
||||
(setq result t)))))
|
||||
(unless result
|
||||
(set-match-data match-data-old))
|
||||
result))
|
||||
|
||||
(defconst haskell-lexeme-qid
|
||||
(rx-to-string `(: (regexp "'*")
|
||||
(regexp ,haskell-lexeme-modid-opt-prefix)
|
||||
(group (regexp ,haskell-lexeme-id))))
|
||||
"Regexp matching a valid qualified identifier.
|
||||
|
||||
Note that (match-string 1) returns the unqualified part.")
|
||||
|
||||
(defconst haskell-lexeme-qsym
|
||||
(rx-to-string `(: (regexp "'*")
|
||||
(regexp ,haskell-lexeme-modid-opt-prefix)
|
||||
(group (regexp ,haskell-lexeme-id))))
|
||||
"Regexp matching a valid qualified symbol.
|
||||
|
||||
Note that (match-string 1) returns the unqualified part.")
|
||||
|
||||
(defconst haskell-lexeme-number
|
||||
(rx (| (: (regexp "[0-9]+\\.[0-9]+") (opt (regexp "[eE][-+]?[0-9]+")))
|
||||
(regexp "[0-9]+[eE][-+]?[0-9]+")
|
||||
(regexp "0[xX][0-9a-fA-F]+")
|
||||
(regexp "0[oO][0-7]+")
|
||||
(regexp "[0-9]+")))
|
||||
"Regexp matching a floating point, decimal, octal or hexadecimal number.
|
||||
|
||||
Note that negative sign char is not part of a number.")
|
||||
|
||||
(defconst haskell-lexeme-char-literal-inside
|
||||
(rx (| (not (any "\n'\\"))
|
||||
(: "\\"
|
||||
(| "a" "b" "f" "n" "r" "t" "v" "\\" "\"" "'"
|
||||
"NUL" "SOH" "STX" "ETX" "EOT" "ENQ" "ACK"
|
||||
"BEL" "BS" "HT" "LF" "VT" "FF" "CR" "SO" "SI" "DLE"
|
||||
"DC1" "DC2" "DC3" "DC4" "NAK" "SYN" "ETB" "CAN"
|
||||
"EM" "SUB" "ESC" "FS" "GS" "RS" "US" "SP" "DEL"
|
||||
(regexp "[0-9]+")
|
||||
(: "x" (regexp "[0-9a-fA-F]+"))
|
||||
(: "o" (regexp "[0-7]+"))
|
||||
(: "^" (regexp "[]A-Z@^_\\[]"))))))
|
||||
"Regexp matching an inside of a character literal.
|
||||
|
||||
Note that `haskell-lexeme-char-literal-inside' matches strictly
|
||||
only escape sequences defined in Haskell Report.")
|
||||
|
||||
(defconst haskell-lexeme--char-literal-rx
|
||||
(rx-to-string `(: (group "'")
|
||||
(| (: (group (regexp "[[:alpha:]_([]")) (group "'")) ; exactly one char
|
||||
(: (group (| (regexp "\\\\[^\n][^'\n]*") ; allow quote just after first backslash
|
||||
(regexp "[^[:alpha:]_(['\n][^'\n]*")))
|
||||
(| (group "'") "\n" (regexp "\\'"))))))
|
||||
"Regexp matching a character literal lookalike.
|
||||
|
||||
Note that `haskell-lexeme--char-literal-rx' matches more than
|
||||
Haskell Report specifies because we want to support also code
|
||||
under edit.
|
||||
|
||||
Character literals end with a quote or a newline or end of
|
||||
buffer.
|
||||
|
||||
Regexp has subgroup expressions:
|
||||
(match-text 1) matches the opening quote.
|
||||
(match-text 2) matches the inside of the character literal.
|
||||
(match-text 3) matches the closing quote or an empty string
|
||||
at the end of line or the end buffer.")
|
||||
|
||||
(defun haskell-lexeme-looking-at-char-literal ()
|
||||
"Non-nil when point is at a char literal lookalike.
|
||||
|
||||
Note that this function matches more than Haskell Report
|
||||
specifies because we want to support also code under edit.
|
||||
|
||||
Char literals end with a quote or an unescaped newline or end
|
||||
of buffer.
|
||||
|
||||
After successful match:
|
||||
(match-text 1) matches the opening quote.
|
||||
(match-text 2) matches the inside of the char literla.
|
||||
(match-text 3) matches the closing quote, or a closing
|
||||
newline or is nil when at the end of the buffer."
|
||||
(when (looking-at haskell-lexeme--char-literal-rx)
|
||||
(set-match-data
|
||||
(list (match-beginning 0) (match-end 0)
|
||||
(match-beginning 1) (match-end 1)
|
||||
(or (match-beginning 2) (match-beginning 4)) (or (match-end 2) (match-end 4))
|
||||
(or (match-beginning 3) (match-beginning 5)) (or (match-end 3) (match-end 5))))
|
||||
t))
|
||||
|
||||
(defconst haskell-lexeme-string-literal-inside-item
|
||||
(rx (| (not (any "\n\"\\"))
|
||||
(: "\\"
|
||||
(| "a" "b" "f" "n" "r" "t" "v" "\\" "\"" "'" "&"
|
||||
"NUL" "SOH" "STX" "ETX" "EOT" "ENQ" "ACK"
|
||||
"BEL" "BS" "HT" "LF" "VT" "FF" "CR" "SO" "SI" "DLE"
|
||||
"DC1" "DC2" "DC3" "DC4" "NAK" "SYN" "ETB" "CAN"
|
||||
"EM" "SUB" "ESC" "FS" "GS" "RS" "US" "SP" "DEL"
|
||||
(regexp "[0-9]+")
|
||||
(: "x" (regexp "[0-9a-fA-F]+"))
|
||||
(: "o" (regexp "[0-7]+"))
|
||||
(: "^" (regexp "[]A-Z@^_\\[]"))
|
||||
(regexp "[ \t\n\r\v\f]*\\\\")))))
|
||||
"Regexp matching an item that is a single character or a single
|
||||
escape sequence inside of a string literal.
|
||||
|
||||
Note that `haskell-lexeme-string-literal-inside-item' matches
|
||||
strictly only escape sequences defined in Haskell Report.")
|
||||
|
||||
(defconst haskell-lexeme-string-literal
|
||||
(rx (: (group "\"")
|
||||
(group (* (| (regexp "\\\\[ \t\n\r\v\f]*\\\\")
|
||||
(regexp "\\\\[ \t\n\r\v\f]+")
|
||||
(regexp "\\\\[^ \t\n\r\v\f]")
|
||||
(* (regexp "[^\"\n\\]")))))
|
||||
(group (| "\"" (regexp "$") (regexp "\\\\?\\'")
|
||||
))))
|
||||
"Regexp matching a string literal lookalike.
|
||||
|
||||
Note that `haskell-lexeme-string-literal' matches more than
|
||||
Haskell Report specifies because we want to support also code
|
||||
under edit.
|
||||
|
||||
String literals end with double quote or unescaped newline or end
|
||||
of buffer.
|
||||
|
||||
Regexp has subgroup expressions:
|
||||
(match-text 1) matches the opening double quote.
|
||||
(match-text 2) matches the inside of the string.
|
||||
(match-text 3) matches the closing double quote or an empty string
|
||||
at the end of line or the end buffer.")
|
||||
|
||||
(defun haskell-lexeme-looking-at-string-literal ()
|
||||
"Non-nil when point is at a string literal lookalike.
|
||||
|
||||
Note that this function matches more than Haskell Report
|
||||
specifies because we want to support also code under edit.
|
||||
|
||||
String literals end with double quote or unescaped newline or end
|
||||
of buffer.
|
||||
|
||||
After successful match:
|
||||
(match-text 1) matches the opening doublequote.
|
||||
(match-text 2) matches the inside of the string.
|
||||
(match-text 3) matches the closing quote, or a closing
|
||||
newline or is nil when at the end of the buffer."
|
||||
(when (looking-at "\"")
|
||||
(save-excursion
|
||||
(let ((begin (point)))
|
||||
(goto-char (match-end 0))
|
||||
(let (finish)
|
||||
(while (and (not finish)
|
||||
(re-search-forward "[\"\n\\]" nil 'goto-eob))
|
||||
(cond
|
||||
((equal (match-string 0) "\\")
|
||||
(if (looking-at "[ \t\n\r\v\f]+\\\\?")
|
||||
(goto-char (match-end 0))
|
||||
(goto-char (1+ (point)))))
|
||||
|
||||
((equal (match-string 0) "\"")
|
||||
(set-match-data
|
||||
(list begin (match-end 0)
|
||||
begin (1+ begin)
|
||||
(1+ begin) (match-beginning 0)
|
||||
(match-beginning 0) (match-end 0)))
|
||||
(setq finish t))
|
||||
|
||||
((equal (match-string 0) "\n")
|
||||
(set-match-data
|
||||
(list begin (match-beginning 0)
|
||||
begin (1+ begin)
|
||||
(1+ begin) (match-beginning 0)
|
||||
nil nil))
|
||||
(setq finish t))))
|
||||
(unless finish
|
||||
;; string closed by end of buffer
|
||||
(set-match-data
|
||||
(list begin (point)
|
||||
begin (1+ begin)
|
||||
(1+ begin) (point)
|
||||
nil nil))))))
|
||||
;; there was a match
|
||||
t))
|
||||
|
||||
(defun haskell-lexeme-looking-at-quasi-quote-literal ()
|
||||
"Non-nil when point is just in front of Template Haskell
|
||||
quaisquote literal.
|
||||
|
||||
Quasi quotes start with '[xxx|' or '[$xxx|' sequence and end with
|
||||
'|]'. The 'xxx' is a quoter name. There is no escaping mechanism
|
||||
provided for the ending sequence.
|
||||
|
||||
Regexp has subgroup expressions:
|
||||
(match-text 1) matches the quoter name (without $ sign if present).
|
||||
(match-text 2) matches the opening vertical bar.
|
||||
(match-text 3) matches the inside of the quoted string.
|
||||
(match-text 4) matches the closing vertical bar
|
||||
or nil if at the end of the buffer.
|
||||
|
||||
Note that this function excludes 'e', 't', 'd', 'p' as quoter
|
||||
names according to Template Haskell specification."
|
||||
(let ((match-data-old (match-data)))
|
||||
(if (and
|
||||
(looking-at (rx-to-string `(: "[" (optional "$")
|
||||
(group (regexp ,haskell-lexeme-id))
|
||||
(group "|"))))
|
||||
(equal (haskell-lexeme-classify-by-first-char (char-after (match-beginning 1)))
|
||||
'varid)
|
||||
(not (member (match-string 1) '("e" "t" "d" "p"))))
|
||||
(save-excursion
|
||||
;; note that quasi quote syntax does not have any escaping
|
||||
;; mechanism and if not closed it will span til lthe end of buffer
|
||||
(goto-char (match-end 0))
|
||||
(let ((match-data (match-data))
|
||||
(match-data-2 (and (re-search-forward "|]" nil t)
|
||||
(match-data))))
|
||||
(if match-data-2
|
||||
(set-match-data
|
||||
(list
|
||||
(nth 0 match-data) (nth 1 match-data-2) ;; whole match
|
||||
(nth 2 match-data) (nth 3 match-data) ;; quoter name
|
||||
(nth 4 match-data) (nth 5 match-data) ;; opening bar
|
||||
(nth 5 match-data) (nth 0 match-data-2) ;; inner string
|
||||
(nth 0 match-data-2) (1+ (nth 0 match-data-2)))) ;; closing bar
|
||||
|
||||
(set-match-data
|
||||
(list
|
||||
(nth 0 match-data) (point-max) ;; whole match
|
||||
(nth 2 match-data) (nth 3 match-data) ;; quoter name
|
||||
(nth 4 match-data) (nth 5 match-data) ;; opening bar
|
||||
(nth 5 match-data) (point-max) ;; inner string
|
||||
nil nil)) ;; closing bar
|
||||
))
|
||||
t)
|
||||
;; restore old match data if not matched
|
||||
(set-match-data match-data-old)
|
||||
nil)))
|
||||
|
||||
(defun haskell-lexeme-classify-by-first-char (char)
|
||||
"Classify token by CHAR.
|
||||
|
||||
CHAR is a chararacter that is assumed to be the first character
|
||||
of a token."
|
||||
(let ((category (get-char-code-property (or char ?\ ) 'general-category)))
|
||||
|
||||
(cond
|
||||
((or (member char '(?! ?# ?$ ?% ?& ?* ?+ ?. ?/ ?< ?= ?> ?? ?@ ?^ ?| ?~ ?\\ ?-))
|
||||
(and (> char 127)
|
||||
(member category '(Pc Pd Po Sm Sc Sk So))))
|
||||
'varsym)
|
||||
((equal char ?:)
|
||||
'consym)
|
||||
((equal char ?\')
|
||||
'char)
|
||||
((equal char ?\")
|
||||
'string)
|
||||
((member category '(Lu Lt))
|
||||
'conid)
|
||||
((or (equal char ?_)
|
||||
(member category '(Ll Lo)))
|
||||
'varid)
|
||||
((and (>= char ?0) (<= char ?9))
|
||||
'number)
|
||||
((member char '(?\] ?\[ ?\( ?\) ?\{ ?\} ?\` ?\, ?\;))
|
||||
'special))))
|
||||
|
||||
(defun haskell-lexeme-looking-at-token (&rest flags)
|
||||
"Like `looking-at' but understands Haskell lexemes.
|
||||
|
||||
Moves point forward over whitespace. Returns a symbol describing
|
||||
type of Haskell token recognized. Use `match-string',
|
||||
`match-beginning' and `match-end' with argument 0 to query match
|
||||
result.
|
||||
|
||||
Possible results are:
|
||||
- 'special: for chars [](){}`,;
|
||||
- 'comment: for single line comments
|
||||
- 'nested-comment: for multiline comments
|
||||
- 'qsymid: for qualified identifiers or symbols
|
||||
- 'string: for strings literals
|
||||
- 'char: for char literals
|
||||
- 'number: for decimal, float, hexadecimal and octal number literals
|
||||
- 'template-haskell-quote: for a string of apostrophes for template haskell
|
||||
- 'template-haskell-quasi-quote: for a string of apostrophes for template haskell
|
||||
|
||||
Note that for qualified symbols (match-string 1) returns the
|
||||
unqualified identifier or symbol. Further qualification for
|
||||
symbol or identifier can be done with:
|
||||
|
||||
(haskell-lexeme-classify-by-first-char (char-after (match-beginning 1)))
|
||||
|
||||
See `haskell-lexeme-classify-by-first-char' for details."
|
||||
(while
|
||||
;; Due to how unterminated strings terminate at newline, some
|
||||
;; newlines have syntax set to generic string delimeter. We want
|
||||
;; those to be treated as whitespace anyway
|
||||
(or
|
||||
(> (skip-syntax-forward "-") 0)
|
||||
(and (not (member 'newline flags))
|
||||
(> (skip-chars-forward "\n") 0))))
|
||||
(let
|
||||
((case-fold-search nil)
|
||||
(point (point-marker)))
|
||||
(or
|
||||
(and
|
||||
(equal (string-to-syntax "<")
|
||||
(get-char-property (point) 'syntax-table))
|
||||
(progn
|
||||
(set-match-data (list point (set-marker (make-marker) (line-end-position))))
|
||||
'literate-comment))
|
||||
(and (looking-at "\n")
|
||||
'newline)
|
||||
(and (looking-at "{-")
|
||||
(save-excursion
|
||||
(forward-comment 1)
|
||||
(set-match-data (list point (point-marker)))
|
||||
'nested-comment))
|
||||
(and (haskell-lexeme-looking-at-char-literal)
|
||||
'char)
|
||||
(and (haskell-lexeme-looking-at-string-literal)
|
||||
'string)
|
||||
(and (looking-at "[][(){}`,;]")
|
||||
(if (haskell-lexeme-looking-at-quasi-quote-literal)
|
||||
'template-haskell-quasi-quote
|
||||
'special))
|
||||
(and (haskell-lexeme-looking-at-qidsym)
|
||||
(if (save-match-data
|
||||
(string-match "\\`---*\\'" (match-string-no-properties 0)))
|
||||
(progn
|
||||
(set-match-data (list point (set-marker (make-marker) (line-end-position))))
|
||||
'comment)
|
||||
'qsymid))
|
||||
(and (looking-at haskell-lexeme-number)
|
||||
'number)
|
||||
(and (looking-at "'+")
|
||||
'template-haskell-quote)
|
||||
(and (looking-at ".")
|
||||
'illegal))))
|
||||
|
||||
(provide 'haskell-lexeme)
|
||||
|
||||
;;; haskell-lexeme.el ends here
|
||||
Reference in New Issue
Block a user