From f5359d4656772bca0c3b10eda259cbe8ca6654d0 Mon Sep 17 00:00:00 2001
From: dakkar <dakkar@thenautilus.net>
Date: Sat, 9 Mar 2024 09:39:48 +0000
Subject: [PATCH] normalise emoji text

also, tests
---
 src/internal/parser.ts |  2 +-
 test/parser.ts         | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/internal/parser.ts b/src/internal/parser.ts
index ef3264d..a31109b 100644
--- a/src/internal/parser.ts
+++ b/src/internal/parser.ts
@@ -629,7 +629,7 @@ export const language = P.createLanguage({
 			P.regexp(/[\p{Letter}\p{Number}\p{Mark}_+-]+/iu),
 			mark,
 			P.alt([P.lineEnd, side]),
-		], 2).map(name => M.EMOJI_CODE(name as string));
+		], 2).map(name => M.EMOJI_CODE((name as string).normalize('NFC')));
 	},
 
 	link: r => {
diff --git a/test/parser.ts b/test/parser.ts
index d3000df..1bad6c8 100644
--- a/test/parser.ts
+++ b/test/parser.ts
@@ -338,6 +338,30 @@ hoge`;
 			const output = [EMOJI_CODE('abc')];
 			assert.deepStrictEqual(mfm.parse(input), output);
 		});
+
+		test('non-ASCII', () => { // emoji codes named in accented Latin, Devanagari, CJK and Thai
+			const input = ':taneŝima_ĝojas:, :मार्जारः:, :鹅:, :taneŝima_malsanas:, :แมว:, and :लक्षणा:';
+			const output = [
+				EMOJI_CODE('taneŝima_ĝojas'),
+				TEXT(', '),
+				EMOJI_CODE('मार्जारः'),
+				TEXT(', '),
+				EMOJI_CODE('鹅'), // same character as in the input (was stored as mis-decoded UTF-8 bytes)
+				TEXT(', '),
+				EMOJI_CODE('taneŝima_malsanas'),
+				TEXT(', '),
+				EMOJI_CODE('แมว'),
+				TEXT(', and '),
+				EMOJI_CODE('लक्षणा'),
+			];
+			assert.deepStrictEqual(mfm.parse(input), output); // text between the codes must survive as TEXT nodes
+		});
+
+		test('non-ASCII normalization', () => {
+			// 'o' + combining diaeresis (U+0308) goes in; the precomposed U+00F6 is expected back
+			const expected = [EMOJI_CODE('f\u{00F6}o')];
+			assert.deepStrictEqual(mfm.parse(':fo\u{0308}o:'), expected);
+		});
 	});
 
 	describe('unicode emoji', () => {
-- 
GitLab