@@ -9,13 +9,15 @@ public meta import Batteries.Data.String.Matcher
99public meta import Lake.Util.Casing
1010public import Batteries.Data.String.Basic
1111public import Mathlib.Data.Nat.Notation
12+ public meta import Mathlib.Tactic.Linter.TextBased.UnicodeLinter
1213
1314-- Don't warn about the lake import: the above file has almost no imports, and this PR has been
1415-- benchmarked.
1516set_option linter.style.header false
1617
1718meta section
1819
20+
1921/-!
2022## Text-based linters
2123
@@ -29,6 +31,7 @@ Currently, this file contains linters checking
2931- for module names to be in upper camel case,
3032- for module names to be valid Windows filenames, and containing no forbidden characters such as
3133 `!`, `.` or spaces.
34+ - for any code containing blocklisted unicode characters
3235
3336 For historic reasons, some further such checks are written in a Python script `lint-style.py`:
3437these are gradually being rewritten in Lean.
@@ -55,9 +58,9 @@ inductive StyleError where
5558 | trailingWhitespace
5659 /-- A line contains a space before a semicolon -/
5760 | semicolon
58- /-- A line contains a non-breaking space character -/
59- | nonbreakingSpace
60- deriving BEq
61+ /-- A unicode character was used that isn't allowed -/
62+ | unwantedUnicode (c : Char)
63+ deriving BEq, Inhabited
6164
6265/-- How to format style errors -/
6366public inductive ErrorFormat
@@ -79,7 +82,8 @@ def StyleError.errorMessage (err : StyleError) : String := match err with
7982 endings (\n ) instead"
8083 | trailingWhitespace => "This line ends with some whitespace: please remove this"
8184 | semicolon => "This line contains a space before a semicolon"
82- | nonbreakingSpace => "This line contains a non-breaking space character"
85+ | StyleError.unwantedUnicode c => s! "This line contains a bad unicode character \
86+ '{ c} ' ({ UnicodeLinter.printCodepointHex c} )."
8387
8488/-- The error code for a given style error. Keep this in sync with `parse?_errorContext` below! -/
8589-- FUTURE: we're matching the old codes in `lint-style.py` for compatibility;
@@ -89,7 +93,8 @@ def StyleError.errorCode (err : StyleError) : String := match err with
8993 | StyleError.windowsLineEnding => "ERR_WIN"
9094 | StyleError.trailingWhitespace => "ERR_TWS"
9195 | StyleError.semicolon => "ERR_SEM"
92- | StyleError.nonbreakingSpace => "ERR_NSP"
96+ | StyleError.unwantedUnicode _ => "ERR_UNICODE"
97+
9398
9499/-- Context for a style error: the actual error, the line number in the file we're reading
95100and the path to the file. -/
@@ -100,6 +105,7 @@ structure ErrorContext where
100105 lineNumber : ℕ
101106 /-- The path to the file which was linted -/
102107 path : FilePath
108+ deriving BEq
103109
104110/-- Possible results of comparing an `ErrorContext` to an `existing` entry:
105111most often, they are different --- if the existing entry covers the new exception,
@@ -152,11 +158,16 @@ def outputMessage (errctx : ErrorContext) (style : ErrorFormat) : String :=
152158 -- Print for humans: clickable file name and omit the error code
153159 s! "error: { errctx.path} :{ errctx.lineNumber} : { errorMessage} "
154160
155- /-- Try parsing an `ErrorContext` from a string: return `some` if successful, `none` otherwise. -/
161+ /-- Try parsing an `ErrorContext` from a string: return `some` if successful, `none` otherwise.
162+ Used for, e.g., parsing the "exceptions" file.
163+
164+ Need to ensure (see unit tests in `MathlibTest/LintStyle.lean`) that
165+ `∀ (ec : ErrorContext), (parse?_errorContext <| outputMessage ec .exceptionsFile) = some ec`
166+ -/
156167def parse?_errorContext (line : String) : Option ErrorContext := Id.run do
157168 let parts := line.splitToList (· == ' ' )
158169 match parts with
159- | filename :: ":" :: "line" :: lineNumber :: ":" :: errorCode :: ":" :: _errorMessage =>
170+ | filename :: ":" :: "line" :: lineNumber :: ":" :: errorCode :: ":" :: errorMessage =>
160171 -- Turn the filename into a path. In general, this is ambiguous if we don't know if we're
161172 -- dealing with e.g. Windows or POSIX paths. In our setting, this is fine, since no path
162173 -- component contains any path separator.
@@ -170,7 +181,13 @@ def parse?_errorContext (line : String) : Option ErrorContext := Id.run do
170181 | "ERR_SEM" => some (StyleError.semicolon)
171182 | "ERR_TWS" => some (StyleError.trailingWhitespace)
172183 | "ERR_WIN" => some (StyleError.windowsLineEnding)
173- | "ERR_NSP" => some (StyleError.nonbreakingSpace)
184+ | "ERR_UNICODE" => do
185+ -- extract the offending unicode character from `errorMessage`
186+ -- (if the offending character is 'C', `errorMessage[7] == "'C'"` )
187+ -- and wrap it in the appropriate `StyleError`, which will print it as '+NNNN'
188+ let str ← errorMessage[7 ]?
189+ let c ← String.Pos.Raw.get? str ⟨1 ⟩ -- take middle character of expected three
190+ StyleError.unwantedUnicode c
174191 | _ => none
175192 match String.toNat? lineNumber with
176193 | some n => err.map fun e ↦ (ErrorContext.mk e n path)
@@ -194,7 +211,7 @@ def formatErrors (errors : Array ErrorContext) (style : ErrorFormat) : IO Unit :
194211 IO.println (outputMessage e style)
195212
196213/-- Core logic of a text based linter: given a collection of lines,
197- return an array of all style errors with line numbers. If possible,
214+ return an array of all style errors with (1-based!) line numbers. If possible,
198215also return the collection of all lines, changed as needed to fix the linter errors.
199216(Such automatic fixes are only possible for some kinds of `StyleError`s.)
200217-/
@@ -253,20 +270,6 @@ def semicolonLinter : TextbasedLinter := fun opts lines ↦ Id.run do
253270 fixedLines := fixedLines.set! idx (line.replace (String.ofList [' ' , ';' ]) ";" )
254271 return (errors, if errors.size > 0 then some fixedLines else none)
255272
256- /-- Lint a collection of input strings for a non-breaking space character. -/
257- public register_option linter.nonbreakingSpace : Bool := { defValue := true }
258-
259- @ [inherit_doc linter.nonbreakingSpace]
260- def nonbreakingSpaceLinter : TextbasedLinter := fun opts lines ↦ Id.run do
261- unless getLinterValue linter.nonbreakingSpace opts do return (#[], none)
262- let mut errors := Array.mkEmpty 0
263- for h : idx in [:lines.size] do
264- let line := lines[idx]
265- let pos := line.find (· == ' ' )
266- if pos != line.endPos then
267- errors := errors.push (StyleError.nonbreakingSpace, idx + 1 )
268- return (errors, none)
269-
270273/-- Whether a collection of lines consists *only* of imports, blank lines and single-line comments.
271274In practice, this means it's an imports-only file and exempt from almost all linting. -/
272275def isImportsOnlyFile (lines : Array String) : Bool :=
@@ -276,12 +279,70 @@ def isImportsOnlyFile (lines : Array String) : Bool :=
276279
277280end
278281
282+ namespace UnicodeLinter
283+
284+ /-- Creates `StyleError`s for bad usage of unicode characters. -/
285+ def findBadUnicodeAux (s : String) (pos : s.Pos)
286+ (err : Array StyleError := #[]) : Array StyleError :=
287+ if h : pos < s.endPos then
288+ let c := pos.get (show pos ≠ s.endPos from String.Pos.ne_of_lt h)
289+ let posₙ := pos.next (show pos ≠ s.endPos from String.Pos.ne_of_lt h)
290+ have : posₙ.remainingBytes < pos.remainingBytes :=
291+ (pos.lt_iff_remainingBytes_lt posₙ).mp pos.lt_next
292+ if ! isAllowedCharacter c then
293+ -- bad: character not allowed. Add StyleError and continue recursion.
294+ findBadUnicodeAux s posₙ (err.push (.unwantedUnicode c))
295+ else
296+ -- okay. Continue recursion.
297+ findBadUnicodeAux s posₙ err
298+ else
299+ err
300+ termination_by pos.remainingBytes
301+
302+ /-- Creates `StyleError`s for bad usage of unicode characters. -/
303+ @[inline]
304+ def findBadUnicode (s : String) : Array StyleError :=
305+ findBadUnicodeAux s s.startPos
306+
307+ end UnicodeLinter
308+
309+ /-- Lint a collection of input strings for disallowed unicode characters. -/
310+ public register_option linter.unicodeLinter : Bool := { defValue := true }
311+
312+ @ [inherit_doc linter.unicodeLinter]
313+ def unicodeLinter : TextbasedLinter := fun opts lines ↦ Id.run do
314+ unless getLinterValue linter.unicodeLinter opts do return (#[], none)
315+
316+ let mut changed : Array String := #[]
317+ let mut errors : Array (StyleError × ℕ) := Array.mkEmpty 0
318+ let mut lineNumber := 1 -- one-based line numbers!
319+ for line in lines do
320+ let err := UnicodeLinter.findBadUnicode line
321+
322+ -- try to auto-fix the style error
323+ let mut newLine := line
324+ for e in err.reverse do -- reversing is a cheap fix to prevent shifting indices
325+ match e with
326+ | .unwantedUnicode c =>
327+ if let some replacement := UnicodeLinter.replaceDisallowed c then
328+ newLine := newLine.replace c replacement
329+ else
330+ pure ()
331+ | _ => unreachable!
332+
333+ changed := changed.push newLine
334+ errors := errors.append (err.map (fun e => (e, lineNumber)))
335+ lineNumber := lineNumber + 1
336+ return (errors, if (changed == lines) then none else some changed)
337+
279338/-- All text-based linters registered in this file. -/
280339def allLinters : Array TextbasedLinter := #[
281- adaptationNoteLinter, semicolonLinter, trailingWhitespaceLinter, nonbreakingSpaceLinter
340+ adaptationNoteLinter,
341+ semicolonLinter,
342+ trailingWhitespaceLinter,
343+ unicodeLinter,
282344 ]
283345
284-
285346/-- Read a file and apply all text-based linters.
286347Return a list of all unexpected errors, and, if some errors could be fixed automatically,
287348the collection of all lines with every automatic fix applied.
@@ -310,13 +371,33 @@ def lintFile (opts : LinterOptions) (path : FilePath) (exceptions : Array ErrorC
310371 let mut changed := lines
311372
312373 for lint in allLinters do
313- let (err, changes) := lint opts changed
314- allOutput := allOutput.append (Array.map (fun (e, n) ↦ #[(ErrorContext.mk e n path)]) err)
315- -- TODO: auto-fixes do not take style exceptions into account
374+ let (new_errors, changes) := lint opts changed
316375 if let some c := changes then
317- changed := c
376+ -- apply linter's suggested changes only where no exceptions apply.
377+ -- Each changed line must correspond to line number of at least one error.
378+ if changed.size != c.size then
379+ throw <| IO.userError "linter's suggested changes must have same number of lines as input"
380+ -- For each line in `changed`,
381+ changed := Array.ofFn fun (lineIdx : Fin changed.size) ↦
382+ -- check if any exception applies:
383+ if new_errors.any fun (e, idx) ↦
384+ (idx - 1 == lineIdx) -- Subtract 1 since linter's line numbers are one-based
385+ ∧ (ErrorContext.find?_comparable ⟨e, lineIdx, path⟩ exceptions).isSome
386+ then
387+ c[lineIdx]! -- no exception applies. Assign linter's suggestion.
388+ else
389+ changed[lineIdx]! -- An least one exception applies. Ignore linter's suggested line.
390+ -- Note: to keep logic simple, changed lines where an exception applies are left alone,
391+ -- even if there are other suggested changes where no exception applies.
392+
393+ -- append ALL errors to the output. For this, exception filtering happens later below.
394+ allOutput := allOutput.append
395+ (Array.map (fun (e, n) ↦ #[(ErrorContext.mk e n path)]) new_errors)
396+ if changed != lines then
318397 changes_made := true
319- -- This list is not sorted: for github, this is fine.
398+ -- Note: we ASSUME that the linters' auto-fixes do not introduce new issues!
399+
400+ -- Filter exceptions. Note: This list is not sorted. For github, this is fine.
320401 errors := errors.append
321402 (allOutput.flatten.filter (fun e ↦ (e.find?_comparable exceptions).isNone))
322403 return (errors, if changes_made then some changed else none)
0 commit comments