Skip to content

Commit 831b8d0

Browse files
feat(Tactic/Linter): refactor non-breaking-space-linter into a unicode-linter and add auto-fixing (leanprover-community#34022)
Replace the existing non-breaking-space-style-linter with an improved unicode-style linter and implement automatic fixes. The blocklist could then be refined in future PRs. Everything flagged by this linter can be fixed fully automatically with `lake exe lint-style --fix` or by commenting `bot fix style` on the PR. Co-authored-by: Jon Eugster <eugster.jon@gmail.com>
1 parent 307c2bf commit 831b8d0

7 files changed

Lines changed: 211 additions & 33 deletions

File tree

Mathlib.lean

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6713,6 +6713,7 @@ public import Mathlib.Tactic.Linter.PrivateModule
67136713
public import Mathlib.Tactic.Linter.Style
67146714
public import Mathlib.Tactic.Linter.TacticDocumentation
67156715
public import Mathlib.Tactic.Linter.TextBased
6716+
public import Mathlib.Tactic.Linter.TextBased.UnicodeLinter
67166717
public import Mathlib.Tactic.Linter.UnusedInstancesInType
67176718
public import Mathlib.Tactic.Linter.UnusedTactic
67186719
public import Mathlib.Tactic.Linter.UnusedTacticExtension

Mathlib/Tactic.lean

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ public import Mathlib.Tactic.Linter.PrivateModule
174174
public import Mathlib.Tactic.Linter.Style
175175
public import Mathlib.Tactic.Linter.TacticDocumentation
176176
public import Mathlib.Tactic.Linter.TextBased
177+
public import Mathlib.Tactic.Linter.TextBased.UnicodeLinter
177178
public import Mathlib.Tactic.Linter.UnusedInstancesInType
178179
public import Mathlib.Tactic.Linter.UnusedTactic
179180
public import Mathlib.Tactic.Linter.UnusedTacticExtension

Mathlib/Tactic/Linter/TextBased.lean

Lines changed: 111 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@ public meta import Batteries.Data.String.Matcher
99
public meta import Lake.Util.Casing
1010
public import Batteries.Data.String.Basic
1111
public import Mathlib.Data.Nat.Notation
12+
public meta import Mathlib.Tactic.Linter.TextBased.UnicodeLinter
1213

1314
-- Don't warn about the lake import: the above file has almost no imports, and this PR has been
1415
-- benchmarked.
1516
set_option linter.style.header false
1617

1718
meta section
1819

20+
1921
/-!
2022
## Text-based linters
2123
@@ -29,6 +31,7 @@ Currently, this file contains linters checking
2931
- for module names to be in upper camel case,
3032
- for module names to be valid Windows filenames, and containing no forbidden characters such as
3133
`!`, `.` or spaces.
34+
- for any code containing blocklisted unicode characters
3235
3336
For historic reasons, some further such checks are written in a Python script `lint-style.py`:
3437
these are gradually being rewritten in Lean.
@@ -55,9 +58,9 @@ inductive StyleError where
5558
| trailingWhitespace
5659
/-- A line contains a space before a semicolon -/
5760
| semicolon
58-
/-- A line contains a non-breaking space character -/
59-
| nonbreakingSpace
60-
deriving BEq
61+
/-- A unicode character was used that isn't allowed -/
62+
| unwantedUnicode (c : Char)
63+
deriving BEq, Inhabited
6164

6265
/-- How to format style errors -/
6366
public inductive ErrorFormat
@@ -79,7 +82,8 @@ def StyleError.errorMessage (err : StyleError) : String := match err with
7982
endings (\n) instead"
8083
| trailingWhitespace => "This line ends with some whitespace: please remove this"
8184
| semicolon => "This line contains a space before a semicolon"
82-
| nonbreakingSpace => "This line contains a non-breaking space character"
85+
| StyleError.unwantedUnicode c => s!"This line contains a bad unicode character \
86+
'{c}' ({UnicodeLinter.printCodepointHex c})."
8387

8488
/-- The error code for a given style error. Keep this in sync with `parse?_errorContext` below! -/
8589
-- FUTURE: we're matching the old codes in `lint-style.py` for compatibility;
@@ -89,7 +93,8 @@ def StyleError.errorCode (err : StyleError) : String := match err with
8993
| StyleError.windowsLineEnding => "ERR_WIN"
9094
| StyleError.trailingWhitespace => "ERR_TWS"
9195
| StyleError.semicolon => "ERR_SEM"
92-
| StyleError.nonbreakingSpace => "ERR_NSP"
96+
| StyleError.unwantedUnicode _ => "ERR_UNICODE"
97+
9398

9499
/-- Context for a style error: the actual error, the line number in the file we're reading
95100
and the path to the file. -/
@@ -100,6 +105,7 @@ structure ErrorContext where
100105
lineNumber : ℕ
101106
/-- The path to the file which was linted -/
102107
path : FilePath
108+
deriving BEq
103109

104110
/-- Possible results of comparing an `ErrorContext` to an `existing` entry:
105111
most often, they are different --- if the existing entry covers the new exception,
@@ -152,11 +158,16 @@ def outputMessage (errctx : ErrorContext) (style : ErrorFormat) : String :=
152158
-- Print for humans: clickable file name and omit the error code
153159
s!"error: {errctx.path}:{errctx.lineNumber}: {errorMessage}"
154160

155-
/-- Try parsing an `ErrorContext` from a string: return `some` if successful, `none` otherwise. -/
161+
/-- Try parsing an `ErrorContext` from a string: return `some` if successful, `none` otherwise.
162+
Used for, e.g., parsing the "exceptions" file.
163+
164+
Need to ensure (see unit tests in `MathlibTest/LintStyle.lean`) that
165+
`∀ (ec : ErrorContext), (parse?_errorContext <| outputMessage ec .exceptionsFile) = some ec`
166+
-/
156167
def parse?_errorContext (line : String) : Option ErrorContext := Id.run do
157168
let parts := line.splitToList (· == ' ')
158169
match parts with
159-
| filename :: ":" :: "line" :: lineNumber :: ":" :: errorCode :: ":" :: _errorMessage =>
170+
| filename :: ":" :: "line" :: lineNumber :: ":" :: errorCode :: ":" :: errorMessage =>
160171
-- Turn the filename into a path. In general, this is ambiguous if we don't know if we're
161172
-- dealing with e.g. Windows or POSIX paths. In our setting, this is fine, since no path
162173
-- component contains any path separator.
@@ -170,7 +181,13 @@ def parse?_errorContext (line : String) : Option ErrorContext := Id.run do
170181
| "ERR_SEM" => some (StyleError.semicolon)
171182
| "ERR_TWS" => some (StyleError.trailingWhitespace)
172183
| "ERR_WIN" => some (StyleError.windowsLineEnding)
173-
| "ERR_NSP" => some (StyleError.nonbreakingSpace)
184+
| "ERR_UNICODE" => do
185+
-- extract the offending unicode character from `errorMessage`
186+
-- (if the offending character is 'C', `errorMessage[7] == "'C'"` )
187+
-- and wrap it in the appropriate `StyleError`, which will print it as '+NNNN'
188+
let str ← errorMessage[7]?
189+
let c ← String.Pos.Raw.get? str ⟨1-- take middle character of expected three
190+
StyleError.unwantedUnicode c
174191
| _ => none
175192
match String.toNat? lineNumber with
176193
| some n => err.map fun e ↦ (ErrorContext.mk e n path)
@@ -194,7 +211,7 @@ def formatErrors (errors : Array ErrorContext) (style : ErrorFormat) : IO Unit :
194211
IO.println (outputMessage e style)
195212

196213
/-- Core logic of a text based linter: given a collection of lines,
197-
return an array of all style errors with line numbers. If possible,
214+
return an array of all style errors with (1-based!) line numbers. If possible,
198215
also return the collection of all lines, changed as needed to fix the linter errors.
199216
(Such automatic fixes are only possible for some kinds of `StyleError`s.)
200217
-/
@@ -253,20 +270,6 @@ def semicolonLinter : TextbasedLinter := fun opts lines ↦ Id.run do
253270
fixedLines := fixedLines.set! idx (line.replace (String.ofList [' ', ';']) ";")
254271
return (errors, if errors.size > 0 then some fixedLines else none)
255272

256-
/-- Lint a collection of input strings for a non-breaking space character. -/
257-
public register_option linter.nonbreakingSpace : Bool := { defValue := true }
258-
259-
@[inherit_doc linter.nonbreakingSpace]
260-
def nonbreakingSpaceLinter : TextbasedLinter := fun opts lines ↦ Id.run do
261-
unless getLinterValue linter.nonbreakingSpace opts do return (#[], none)
262-
let mut errors := Array.mkEmpty 0
263-
for h : idx in [:lines.size] do
264-
let line := lines[idx]
265-
let pos := line.find (· == ' ')
266-
if pos != line.endPos then
267-
errors := errors.push (StyleError.nonbreakingSpace, idx + 1)
268-
return (errors, none)
269-
270273
/-- Whether a collection of lines consists *only* of imports, blank lines and single-line comments.
271274
In practice, this means it's an imports-only file and exempt from almost all linting. -/
272275
def isImportsOnlyFile (lines : Array String) : Bool :=
@@ -276,12 +279,70 @@ def isImportsOnlyFile (lines : Array String) : Bool :=
276279

277280
end
278281

282+
namespace UnicodeLinter
283+
284+
/-- Creates `StyleError`s for bad usage of unicode characters. -/
285+
def findBadUnicodeAux (s : String) (pos : s.Pos)
286+
(err : Array StyleError := #[]) : Array StyleError :=
287+
if h : pos < s.endPos then
288+
let c := pos.get (show pos ≠ s.endPos from String.Pos.ne_of_lt h)
289+
let posₙ := pos.next (show pos ≠ s.endPos from String.Pos.ne_of_lt h)
290+
have : posₙ.remainingBytes < pos.remainingBytes :=
291+
(pos.lt_iff_remainingBytes_lt posₙ).mp pos.lt_next
292+
if ! isAllowedCharacter c then
293+
-- bad: character not allowed. Add StyleError and continue recursion.
294+
findBadUnicodeAux s posₙ (err.push (.unwantedUnicode c))
295+
else
296+
-- okay. Continue recursion.
297+
findBadUnicodeAux s posₙ err
298+
else
299+
err
300+
termination_by pos.remainingBytes
301+
302+
/-- Creates `StyleError`s for bad usage of unicode characters. -/
303+
@[inline]
304+
def findBadUnicode (s : String) : Array StyleError :=
305+
findBadUnicodeAux s s.startPos
306+
307+
end UnicodeLinter
308+
309+
/-- Lint a collection of input strings for disallowed unicode characters. -/
310+
public register_option linter.unicodeLinter : Bool := { defValue := true }
311+
312+
@[inherit_doc linter.unicodeLinter]
313+
def unicodeLinter : TextbasedLinter := fun opts lines ↦ Id.run do
314+
unless getLinterValue linter.unicodeLinter opts do return (#[], none)
315+
316+
let mut changed : Array String := #[]
317+
let mut errors : Array (StyleError × ℕ) := Array.mkEmpty 0
318+
let mut lineNumber := 1 -- one-based line numbers!
319+
for line in lines do
320+
let err := UnicodeLinter.findBadUnicode line
321+
322+
-- try to auto-fix the style error
323+
let mut newLine := line
324+
for e in err.reverse do -- reversing is a cheap fix to prevent shifting indices
325+
match e with
326+
| .unwantedUnicode c =>
327+
if let some replacement := UnicodeLinter.replaceDisallowed c then
328+
newLine := newLine.replace c replacement
329+
else
330+
pure ()
331+
| _ => unreachable!
332+
333+
changed := changed.push newLine
334+
errors := errors.append (err.map (fun e => (e, lineNumber)))
335+
lineNumber := lineNumber + 1
336+
return (errors, if (changed == lines) then none else some changed)
337+
279338
/-- All text-based linters registered in this file. -/
280339
def allLinters : Array TextbasedLinter := #[
281-
adaptationNoteLinter, semicolonLinter, trailingWhitespaceLinter, nonbreakingSpaceLinter
340+
adaptationNoteLinter,
341+
semicolonLinter,
342+
trailingWhitespaceLinter,
343+
unicodeLinter,
282344
]
283345

284-
285346
/-- Read a file and apply all text-based linters.
286347
Return a list of all unexpected errors, and, if some errors could be fixed automatically,
287348
the collection of all lines with every automatic fix applied.
@@ -310,13 +371,33 @@ def lintFile (opts : LinterOptions) (path : FilePath) (exceptions : Array ErrorC
310371
let mut changed := lines
311372

312373
for lint in allLinters do
313-
let (err, changes) := lint opts changed
314-
allOutput := allOutput.append (Array.map (fun (e, n) ↦ #[(ErrorContext.mk e n path)]) err)
315-
-- TODO: auto-fixes do not take style exceptions into account
374+
let (new_errors, changes) := lint opts changed
316375
if let some c := changes then
317-
changed := c
376+
-- apply linter's suggested changes only where no exceptions apply.
377+
-- Each changed line must correspond to line number of at least one error.
378+
if changed.size != c.size then
379+
throw <| IO.userError "linter's suggested changes must have same number of lines as input"
380+
-- For each line in `changed`,
381+
changed := Array.ofFn fun (lineIdx : Fin changed.size) ↦
382+
-- check if any exception applies:
383+
if new_errors.any fun (e, idx) ↦
384+
(idx - 1 == lineIdx) -- Subtract 1 since linter's line numbers are one-based
385+
∧ (ErrorContext.find?_comparable ⟨e, lineIdx, path⟩ exceptions).isSome
386+
then
387+
c[lineIdx]! -- no exception applies. Assign linter's suggestion.
388+
else
389+
changed[lineIdx]! -- An least one exception applies. Ignore linter's suggested line.
390+
-- Note: to keep logic simple, changed lines where an exception applies are left alone,
391+
-- even if there are other suggested changes where no exception applies.
392+
393+
-- append ALL errors to the output. For this, exception filtering happens later below.
394+
allOutput := allOutput.append
395+
(Array.map (fun (e, n) ↦ #[(ErrorContext.mk e n path)]) new_errors)
396+
if changed != lines then
318397
changes_made := true
319-
-- This list is not sorted: for github, this is fine.
398+
-- Note: we ASSUME that the linters' auto-fixes do not introduce new issues!
399+
400+
-- Filter exceptions. Note: This list is not sorted. For github, this is fine.
320401
errors := errors.append
321402
(allOutput.flatten.filter (fun e ↦ (e.find?_comparable exceptions).isNone))
322403
return (errors, if changes_made then some changed else none)
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
/-
2+
Copyright (c) 2024 Jon Eugster. All rights reserved.
3+
Released under Apache 2.0 license as described in the file LICENSE.
4+
Authors: Adomas Baliuka, Jon Eugster
5+
-/
6+
module
7+
8+
import Mathlib.Init
9+
10+
11+
/-!
12+
13+
# Tools for the unicode Linter
14+
15+
The actual linter is defined in `TextBased.lean`.
16+
17+
This file defines the blocklist and other tools used by the linter.
18+
19+
-/
20+
21+
namespace Mathlib.Linter.TextBased.UnicodeLinter
22+
23+
/-- Prints a unicode character's codepoint in hexadecimal with prefix 'U+'.
24+
E.g., 'a' is "U+0061". -/
25+
public def printCodepointHex (c : Char) : String :=
26+
let digits := Nat.toDigits 16 c.val.toNat
27+
-- print at least 4 (could be more) hex characters using leading zeros
28+
("U+".append <| "0000".drop digits.length |>.toString).append <| String.ofList digits
29+
30+
/-- Blocklist: If `false`, the character is not allowed in Mathlib. -/
31+
public def isAllowedCharacter (c : Char) : Bool :=
32+
!#['\u00A0'].contains c -- non-breaking space
33+
34+
/-- Provide default replacement (`String`) for a blocklisted character, or `none` if none defined -/
35+
public def replaceDisallowed : Char → Option String
36+
| '\u00a0' => " " -- replace non-breaking space with normal whitespace
37+
| _ => none
38+
39+
end Mathlib.Linter.TextBased.UnicodeLinter

Mathlib/Tactic/Linter/UnusedTactic.lean

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ abbrev M := StateRefT (Std.HashMap Lean.Syntax.Range Syntax) IO
8585
Lean.Parser.Tactic.failIfSuccess
8686

8787
/--
88-
A list of blacklisted syntax kinds, which are expected to have subterms that contain
88+
A list of blocklisted syntax kinds, which are expected to have subterms that contain
8989
unevaluated tactics.
9090
-/
9191
initialize ignoreTacticKindsRef : IO.Ref NameHashSet ←

MathlibTest/LintStyle.lean

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
1+
module
2+
13
import Mathlib.Data.Nat.Basic
24
import Mathlib.Tactic.Linter.Style
35
import Mathlib.Order.SetNotation
6+
import Mathlib.Tactic.Basic
7+
import Mathlib.Tactic.Contrapose
8+
import all Mathlib.Tactic.Linter.TextBased
9+
import all Mathlib.Tactic.Linter.TextBased.UnicodeLinter
410

511
/-! Tests for all the style linters. -/
612

@@ -601,3 +607,55 @@ example := by
601607
rfl
602608

603609
end showLinter
610+
611+
/-! Tests for linters defined in `TextBased.lean`. -/
612+
section textBased
613+
section unicodeLinter
614+
615+
open Mathlib.Linter.TextBased
616+
open Mathlib.Linter.TextBased.UnicodeLinter
617+
618+
-- test parsing back error messages in `parse?_errorContext` for unicode errors
619+
620+
-- (`meta` because the whole section in `TextBased` is meta.)
621+
meta def ErrorContext.isValid_parse?_error_context (ec : ErrorContext) : Bool :=
622+
(parse?_errorContext <| outputMessage ec .exceptionsFile) == some ec
623+
624+
#guard ErrorContext.isValid_parse?_error_context {
625+
error := .adaptationNote,
626+
lineNumber := 1234, path:="Mathlib/Tactic/Measurability/Init.lean"}
627+
628+
#guard ErrorContext.isValid_parse?_error_context {
629+
error := .windowsLineEnding,
630+
lineNumber := 1234, path:="Mathlib/Tactic/Measurability/Init.lean"}
631+
632+
#guard ErrorContext.isValid_parse?_error_context {
633+
error := .trailingWhitespace,
634+
lineNumber := 1234, path:="Mathlib/Tactic/Measurability/Init.lean"}
635+
636+
#guard ErrorContext.isValid_parse?_error_context {
637+
error := .semicolon,
638+
lineNumber := 1234, path := "Mathlib/Tactic/Measurability/Init.lean"}
639+
640+
#guard ErrorContext.isValid_parse?_error_context {
641+
error := .unwantedUnicode '\u1234',
642+
lineNumber := 1234, path:="./MYFILE.lean"}
643+
644+
#guard ErrorContext.isValid_parse?_error_context {
645+
error := .unwantedUnicode '\u00a0',
646+
lineNumber := 22, path:="Mathlib/Tactic/Measurability/Init.lean"}
647+
648+
set_option linter.unusedTactic false in
649+
set_option linter.flexible false in
650+
/-- An error in this proof could mean that `replaceDisallowed` contains a character
651+
which is not dissallowed by `isAllowedCharacter`. -/
652+
private theorem disallowed_of_replaceable (c : Char) (creplaced : replaceDisallowed c ≠ none) :
653+
!isAllowedCharacter c := by
654+
contrapose creplaced
655+
simp [isAllowedCharacter, Array.contains] at creplaced
656+
repeat obtain ⟨_, creplaced⟩ := creplaced
657+
simp [replaceDisallowed]
658+
659+
end unicodeLinter
660+
661+
end textBased

scripts/nolints-style.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
-- The linter for the string "adaptation note" fires in the implementation of the linter,
55
-- and in the implementation of the #adaptation_note tactic: this is as expected.
6-
Mathlib/Tactic/AdaptationNote.lean : line 9 : ERR_ADN : Found the string "Adaptation note:", please use the #adaptation_note command instead
76
Mathlib/Tactic/AdaptationNote.lean : line 12 : ERR_ADN : Found the string "Adaptation note:", please use the #adaptation_note command instead
87
Mathlib/Tactic/AdaptationNote.lean : line 21 : ERR_ADN : Found the string "Adaptation note:", please use the #adaptation_note command instead
98
Mathlib/Tactic/AdaptationNote.lean : line 27 : ERR_ADN : Found the string "Adaptation note:", please use the #adaptation_note command instead
@@ -15,4 +14,3 @@ Mathlib/Tactic/Linter/TextBased.lean : line 84 : ERR_ADN : Found the string "Ada
1514
Mathlib/Tactic/Linter/TextBased.lean : line 274 : ERR_ADN : Found the string "Adaptation note:", please use the #adaptation_note command instead
1615
Mathlib/Tactic/Linter/TextBased.lean : line 279 : ERR_ADN : Found the string "Adaptation note:", please use the #adaptation_note command instead
1716
Mathlib/Tactic/Linter/TextBased.lean : line 280 : ERR_ADN : Found the string "Adaptation note:", please use the #adaptation_note command instead
18-
Mathlib/Tactic/Linter/TextBased.lean : line 266 : ERR_NSP : This line contains a non-breaking space character

0 commit comments

Comments
 (0)