From 93895b8a01535ee70ccc4d5e3189862a3cae61a2 Mon Sep 17 00:00:00 2001
From: marihachi <marihachi0620@gmail.com>
Date: Thu, 6 Jan 2022 01:05:37 +0900
Subject: [PATCH] Introduce nesting limit (#87)

* support fnDepthLimit

* fix parse option

* rename fnDepthLimit to nestLimit

* support limit nesting: big, bold, small, italic, strike

* improve mention

* fix hashtag

* support limit nesting: hashtag

* support limit nesting: url

* refine link label

* refactor

* fix link

* change default value of nestLimit

* fix link label

* add test

* restore and modify test
---
 etc/mfm-js.api.md         |   1 +
 src/api.ts                |   8 ++-
 src/internal/parser.pegjs | 147 ++++++++++++++++++++++----------------
 test/parser.ts            | 144 ++++++++++++++++++++++++++++++++++++-
 4 files changed, 235 insertions(+), 65 deletions(-)

diff --git a/etc/mfm-js.api.md b/etc/mfm-js.api.md
index d4e14c3..ea4f238 100644
--- a/etc/mfm-js.api.md
+++ b/etc/mfm-js.api.md
@@ -233,6 +233,7 @@ export type NodeType<T extends MfmNode['type']> = T extends 'quote' ? MfmQuote :
 // @public (undocumented)
 export function parse(input: string, opts?: Partial<{
     fnNameList: string[];
+    nestLimit: number;
 }>): MfmNode[];
 
 // Warning: (ae-forgotten-export) The symbol "MfmPlainNode" needs to be exported by the entry point index.d.ts
diff --git a/src/api.ts b/src/api.ts
index 4124061..eaf2b3b 100644
--- a/src/api.ts
+++ b/src/api.ts
@@ -8,8 +8,12 @@ const parser: peg.Parser = require('./internal/parser');
 /**
  * Generates a MfmNode tree from the MFM string.
 */
-export function parse(input: string, opts: Partial<{ fnNameList: string[]; }> = {}): MfmNode[] {
-	const nodes = parser.parse(input, { startRule: 'fullParser', fnNameList: opts.fnNameList });
+export function parse(input: string, opts: Partial<{ fnNameList: string[]; nestLimit: number; }> = {}): MfmNode[] {
+	const nodes = parser.parse(input, {
+		startRule: 'fullParser',
+		fnNameList: opts.fnNameList,
+		nestLimit: opts.nestLimit
+	});
 	return nodes;
 }
 
diff --git a/src/internal/parser.pegjs b/src/internal/parser.pegjs
index 76cc315..715e02c 100644
--- a/src/internal/parser.pegjs
+++ b/src/internal/parser.pegjs
@@ -64,6 +64,28 @@
 		}
 		return options.fnNameList.includes(name);
 	}
+
+	// nesting control
+
+	const nestLimit = options.nestLimit || 20;
+	let depth = 0;
+	function enterNest() {
+		if (depth + 1 > nestLimit) {
+			return false;
+		}
+		depth++;
+		return true;
+	}
+
+	function leaveNest() {
+		depth--;
+		return true;
+	}
+
+	function fallbackNest() {
+		depth--;
+		return false;
+	}
 }
 
 //
@@ -230,19 +252,22 @@ unicodeEmoji
 // inline: big
 
 big
-	= "***" content:(!"***" @inline)+ "***"
+	= "***" content:bigContent "***"
 {
 	return FN('tada', { }, mergeText(content));
 }
 
+bigContent
+	= &{ return enterNest(); } @(@(!"***" @inline)+ &{ return leaveNest(); } / &{ return fallbackNest(); })
+
 // inline: bold
 
 bold
-	= "**" content:(!"**" @inline)+ "**"
+	= "**" content:boldContent "**"
 {
 	return BOLD(mergeText(content));
 }
-	/ "<b>" content:(!"</b>" @inline)+ "</b>"
+	/ "<b>" content:boldTagContent "</b>"
 {
 	return BOLD(mergeText(content));
 }
@@ -252,25 +277,31 @@ bold
 	return BOLD(parsedContent);
 }
 
+boldContent
+	= &{ return enterNest(); } @(@(!"**" @inline)+ &{ return leaveNest(); } / &{ return fallbackNest(); })
+
+boldTagContent
+	= &{ return enterNest(); } @(@(!"</b>" @inline)+ &{ return leaveNest(); } / &{ return fallbackNest(); })
+
 // inline: small
 
 small
-	= "<small>" content:(!"</small>" @inline)+ "</small>"
+	= "<small>" content:smallContent "</small>"
 {
 	return SMALL(mergeText(content));
 }
 
+smallContent
+	= &{ return enterNest(); } @(@(!"</small>" @inline)+ &{ return leaveNest(); } / &{ return fallbackNest(); })
+
 // inline: italic
 
 italic
-	= italicTag
-	/ italicAlt
-
-italicTag
-	= "<i>" content:(!"</i>" @inline)+ "</i>"
+	= "<i>" content:italicContent "</i>"
 {
 	return ITALIC(mergeText(content));
 }
+	/ italicAlt
 
 italicAlt
 	= "*" content:$(!"*" ([a-z0-9]i / _))+ "*" &(EOF / LF / _ / ![a-z0-9]i)
@@ -284,18 +315,27 @@ italicAlt
 	return ITALIC(parsedContent);
 }
 
+italicContent
+	= &{ return enterNest(); } @(@(!"</i>" @inline)+ &{ return leaveNest(); } / &{ return fallbackNest(); })
+
 // inline: strike
 
 strike
-	= "~~" content:(!("~" / LF) @inline)+ "~~"
+	= "~~" content:strikeContent "~~"
 {
 	return STRIKE(mergeText(content));
 }
-	/ "<s>" content:(!("</s>" / LF) @inline)+ "</s>"
+	/ "<s>" content:strikeTagContent "</s>"
 {
 	return STRIKE(mergeText(content));
 }
 
+strikeContent
+	= &{ return enterNest(); } @(@(!("~" / LF) @inline)+ &{ return leaveNest(); } / &{ return fallbackNest(); })
+
+strikeTagContent
+	= &{ return enterNest(); } @(@(!("</s>" / LF) @inline)+ &{ return leaveNest(); } / &{ return fallbackNest(); })
+
 // inline: inlineCode
 
 inlineCode
@@ -321,83 +361,59 @@ mention
 }
 
 mentionName
-	= !"-" mentionNamePart+ // first char is not "-".
+	= [a-z0-9_]i (&("-"+ [a-z0-9_]i) . / [a-z0-9_]i)*
 {
+	// NOTE: first char and last char are not "-".
 	return text();
 }
 
-mentionNamePart
-	= "-" &mentionNamePart // last char is not "-".
-	/ [a-z0-9_]i
-
 mentionHost
-	= ![.-] mentionHostPart+ // first char is neither "." nor "-".
+	= [a-z0-9_]i (&([.-]i+ [a-z0-9_]i) . / [a-z0-9_]i)*
 {
+	// NOTE: first char and last char are neither "." nor "-".
 	return text();
 }
 
-mentionHostPart
-	= [.-] &mentionHostPart // last char is neither "." nor "-".
-	/ [a-z0-9_]i
-
 // inline: hashtag
 
 hashtag
-	= "#" !("\uFE0F"? "\u20E3") content:hashtagContent
+	= "#" !("\uFE0F"? "\u20E3") !(invalidHashtagContent !hashtagContentPart) content:$hashtagContentPart+
 {
 	return HASHTAG(content);
 }
 
-hashtagContent
-	= !(invalidHashtagContent !hashtagContentPart) hashtagContentPart+ { return text(); }
-
 invalidHashtagContent
 	= [0-9]+
 
 hashtagContentPart
-	= hashtagBracketPair
-	/ hashtagChar
+	= "(" hashPairInner ")"
+	/ "[" hashPairInner "]"
+	/ "「" hashPairInner "」"
+	/ ![  \t.,!?'"#:\/\[\]【】()「」<>] CHAR
 
-hashtagBracketPair
-	= "(" hashtagContent* ")"
-	/ "[" hashtagContent* "]"
-	/ "「" hashtagContent* "」"
-
-hashtagChar
-	= ![  \t.,!?'"#:\/\[\]【】()「」<>] CHAR
+hashPairInner
+	= &{ return enterNest(); } @(@hashtagContentPart* &{ return leaveNest(); } / &{ return fallbackNest(); })
 
 // inline: URL
 
 url
-	= "<" url:altUrlFormat ">"
+	= "<" url:$("http" "s"? "://" (!(">" / _) CHAR)+) ">"
 {
 	return N_URL(url, true);
 }
-	/ url:urlFormat
-{
-	return N_URL(url);
-}
-
-urlFormat
-	= "http" "s"? "://" urlContentPart+
+	/ "http" "s"? "://" (&([.,]+ urlContentPart) . / urlContentPart)+
 {
-	return text();
+	// NOTE: last char is neither "." nor ",".
+	return N_URL(text());
 }
 
 urlContentPart
-	= urlBracketPair
-	/ [.,] &urlContentPart // last char is neither "." nor ",".
+	= "(" urlPairInner ")"
+	/ "[" urlPairInner "]"
 	/ [a-z0-9_/:%#@$&?!~=+-]i
 
-urlBracketPair
-	= "(" urlContentPart* ")"
-	/ "[" urlContentPart* "]"
-
-altUrlFormat
-	= "http" "s"? "://" (!(">" / _) CHAR)+
-{
-	return text();
-}
+urlPairInner
+	= &{ return enterNest(); } @(@(urlContentPart / [.,])* &{ return leaveNest(); } / &{ return fallbackNest(); })
 
 // inline: link
 
@@ -408,23 +424,34 @@ link
 }
 
 linkLabel
-	= linkLabelPart+
+	= (!"]" @linkLabelPart)+
 
 linkLabelPart
-	= url { return text(); /* text node */ }
-	/ link { return text(); /* text node */ }
-	/ mention { return text(); /* text node */ }
-	/ !"]" @inline
+	= emojiCode
+	/ unicodeEmoji
+	/ big
+	/ bold
+	/ small
+	/ italic
+	/ strike
+	/ inlineCode
+	/ mathInline
+	/ hashtag
+	/ fn
+	/ inlineText
 
 // inline: fn
 
 fn
-	= "$[" name:$([a-z0-9_]i)+ &{ return ensureFnName(name); } args:fnArgs? _ content:(!("]") @inline)+ "]"
+	= "$[" name:$([a-z0-9_]i)+ &{ return ensureFnName(name); } args:fnArgs? _ content:fnContent "]"
 {
 	args = args || {};
 	return FN(name, args, mergeText(content));
 }
 
+fnContent
+	= &{ return enterNest(); } @(@(!"]" @inline)+ &{ return leaveNest(); } / &{ return fallbackNest(); })
+
 fnArgs
 	= "." head:fnArg tails:("," @fnArg)*
 {
diff --git a/test/parser.ts b/test/parser.ts
index 11d00e2..bcc5757 100644
--- a/test/parser.ts
+++ b/test/parser.ts
@@ -984,14 +984,16 @@ hoge`;
 			assert.deepStrictEqual(mfm.parse(input), output);
 		});
 
-		it('do not yield link node even if label is recognisable as a link', () => {
+		it('cannot nest a link in a link label', () => {
 			const input = 'official instance: [[https://misskey.io/@ai](https://misskey.io/@ai)](https://misskey.io/@ai).';
 			const output = [
 				TEXT('official instance: '),
 				LINK(false, 'https://misskey.io/@ai', [
-					TEXT('[https://misskey.io/@ai](https://misskey.io/@ai)')
+					TEXT('[https://misskey.io/@ai')
 				]),
-				TEXT('.')
+				TEXT(']('),
+				N_URL('https://misskey.io/@ai'),
+				TEXT(').'),
 			];
 			assert.deepStrictEqual(mfm.parse(input), output);
 		});
@@ -1081,6 +1083,142 @@ hoge`;
 		});
 	});
 
+	describe('nesting limit', () => {
+		it('big', () => {
+			const input = '<b><b>***abc***</b></b>';
+			const output = [
+				BOLD([
+					BOLD([
+						TEXT('**'),
+						ITALIC([
+							TEXT('abc'),
+						]),
+						TEXT('**'),
+					]),
+				]),
+			];
+			assert.deepStrictEqual(mfm.parse(input, { nestLimit: 2 }), output);
+		});
+
+		describe('bold', () => {
+			it('basic', () => {
+				const input = '<i><i>**abc**</i></i>';
+				const output = [
+					ITALIC([
+						ITALIC([
+							TEXT('*'),
+							ITALIC([
+								TEXT('abc'),
+							]),
+							TEXT('*'),
+						]),
+					]),
+				];
+				assert.deepStrictEqual(mfm.parse(input, { nestLimit: 2 }), output);
+			});
+
+			it('tag', () => {
+				const input = '<i><i><b>abc</b></i></i>';
+				const output = [
+					ITALIC([
+						ITALIC([
+							TEXT('<b>abc</b>'),
+						]),
+					]),
+				];
+				assert.deepStrictEqual(mfm.parse(input, { nestLimit: 2 }), output);
+			});
+		});
+
+		it('small', () => {
+			const input = '<i><i><small>abc</small></i></i>';
+			const output = [
+				ITALIC([
+					ITALIC([
+						TEXT('<small>abc</small>'),
+					]),
+				]),
+			];
+			assert.deepStrictEqual(mfm.parse(input, { nestLimit: 2 }), output);
+		});
+
+		it('italic', () => {
+			const input = '<b><b><i>abc</i></b></b>';
+			const output = [
+				BOLD([
+					BOLD([
+						TEXT('<i>abc</i>'),
+					]),
+				]),
+			];
+			assert.deepStrictEqual(mfm.parse(input, { nestLimit: 2 }), output);
+		});
+
+		describe('strike', () => {
+			it('basic', () => {
+				const input = '<b><b>~~abc~~</b></b>';
+				const output = [
+					BOLD([
+						BOLD([
+							TEXT('~~abc~~'),
+						]),
+					]),
+				];
+				assert.deepStrictEqual(mfm.parse(input, { nestLimit: 2 }), output);
+			});
+	
+			it('tag', () => {
+				const input = '<b><b><s>abc</s></b></b>';
+				const output = [
+					BOLD([
+						BOLD([
+							TEXT('<s>abc</s>'),
+						]),
+					]),
+				];
+				assert.deepStrictEqual(mfm.parse(input, { nestLimit: 2 }), output);
+			});
+		});
+
+		it('hashtag', () => {
+			const input = '<b><b>#abc(xyz)</b></b>';
+			const output = [
+				BOLD([
+					BOLD([
+						HASHTAG('abc'),
+						TEXT('(xyz)'),
+					]),
+				]),
+			];
+			assert.deepStrictEqual(mfm.parse(input, { nestLimit: 2 }), output);
+		});
+
+		it('url', () => {
+			const input = '<b><b>https://example.com/abc(xyz)</b></b>';
+			const output = [
+				BOLD([
+					BOLD([
+						N_URL('https://example.com/abc'),
+						TEXT('(xyz)'),
+					]),
+				]),
+			];
+			assert.deepStrictEqual(mfm.parse(input, { nestLimit: 2 }), output);
+		});
+
+		it('fn', () => {
+			const input = '<b><b>$[a b]</b></b>';
+			const output = [
+				BOLD([
+					BOLD([
+						TEXT('$[a b]'),
+					]),
+				]),
+			];
+			assert.deepStrictEqual(mfm.parse(input, { nestLimit: 2 }), output);
+		});
+	});
+
 	it('composite', () => {
 		const input =
 `before
-- 
GitLab