From b96942f6ec1c91a4531e7ecf845abdba0e5285f6 Mon Sep 17 00:00:00 2001 From: Alex Lorenz Date: Wed, 13 May 2015 23:10:51 +0000 Subject: [PATCH] YAML: Implement block scalar parsing. This commit implements the parsing of YAML block scalars. Some code existed for it before, but it couldn't parse block scalars. This commit adds a new yaml node type to represent the block scalar values. This commit also deletes the 'spec-09-27' and 'spec-09-28' tests as they are identical to the test file 'spec-09-26'. This commit introduces 3 new utility functions to the YAML scanner class: `skip_s_space`, `advanceWhile` and `consumeLineBreakIfPresent`. Reviewers: Duncan P. N. Exon Smith Differential Revision: http://reviews.llvm.org/D9503 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@237314 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Support/YAMLParser.h | 31 +++ lib/Support/YAMLParser.cpp | 286 +++++++++++++++++++++++++-- test/YAMLParser/spec-09-14.test | 7 +- test/YAMLParser/spec-09-18.test | 6 +- test/YAMLParser/spec-09-19.test | 4 +- test/YAMLParser/spec-09-20.test | 6 +- test/YAMLParser/spec-09-21.test | 2 +- test/YAMLParser/spec-09-22.test | 14 +- test/YAMLParser/spec-09-24.test | 9 +- test/YAMLParser/spec-09-25.test | 3 +- test/YAMLParser/spec-09-26.test | 3 +- test/YAMLParser/spec-09-27.test | 10 - test/YAMLParser/spec-09-28.test | 10 - unittests/Support/YAMLParserTest.cpp | 27 +++ utils/yaml-bench/YAMLBench.cpp | 2 + 15 files changed, 362 insertions(+), 58 deletions(-) delete mode 100644 test/YAMLParser/spec-09-27.test delete mode 100644 test/YAMLParser/spec-09-28.test diff --git a/include/llvm/Support/YAMLParser.h b/include/llvm/Support/YAMLParser.h index 37becaa7b6d..f0b6113e836 100644 --- a/include/llvm/Support/YAMLParser.h +++ b/include/llvm/Support/YAMLParser.h @@ -107,6 +107,7 @@ public: enum NodeKind { NK_Null, NK_Scalar, + NK_BlockScalar, NK_KeyValue, NK_Mapping, NK_Sequence, @@ -222,6 +223,36 @@ private: SmallVectorImpl &Storage) const; }; +/// \brief A block scalar node is an opaque datum that can be presented as a +/// series of zero or more Unicode scalar values. +/// +/// Example: +/// | +/// Hello +/// World +class BlockScalarNode : public Node { + void anchor() override; + +public: + BlockScalarNode(std::unique_ptr &D, StringRef Anchor, StringRef Tag, + std::string &Value, StringRef RawVal) + : Node(NK_BlockScalar, D, Anchor, Tag), Value(std::move(Value)) { + SMLoc Start = SMLoc::getFromPointer(RawVal.begin()); + SMLoc End = SMLoc::getFromPointer(RawVal.end()); + SourceRange = SMRange(Start, End); + } + + /// \brief Gets the value of this node as a StringRef. + StringRef getValue() const { return Value; } + + static inline bool classof(const Node *N) { + return N->getType() == NK_BlockScalar; + } + +private: + std::string Value; +}; + /// \brief A key and value pair. While not technically a Node under the YAML /// representation graph, it is easier to treat them this way. /// diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp index 83c96510846..be9ba00bbee 100644 --- a/lib/Support/YAMLParser.cpp +++ b/lib/Support/YAMLParser.cpp @@ -101,6 +101,7 @@ namespace yaml { void Node::anchor() {} void NullNode::anchor() {} void ScalarNode::anchor() {} +void BlockScalarNode::anchor() {} void KeyValueNode::anchor() {} void MappingNode::anchor() {} void SequenceNode::anchor() {} @@ -128,6 +129,7 @@ struct Token : ilist_node { TK_Key, TK_Value, TK_Scalar, + TK_BlockScalar, TK_Alias, TK_Anchor, TK_Tag @@ -137,6 +139,9 @@ struct Token : ilist_node { /// of the token in the input. StringRef Range; + /// The value of a block scalar node. + std::string Value; + Token() : Kind(TK_Error) {} }; } @@ -348,6 +353,14 @@ private: /// b-break. StringRef::iterator skip_b_break(StringRef::iterator Position); + /// Skip a single s-space[31] starting at Position. + /// + /// An s-space is 0x20 + /// + /// @returns The code unit after the s-space, or Position if it's not a + /// s-space. + StringRef::iterator skip_s_space(StringRef::iterator Position); + /// @brief Skip a single s-white[33] starting at Position. /// /// A s-white is 0x20 | 0x9 @@ -373,6 +386,10 @@ private: StringRef::iterator skip_while( SkipWhileFunc Func , StringRef::iterator Position); + /// Skip minimal well-formed code unit subsequences until Func returns its + /// input. + void advanceWhile(SkipWhileFunc Func); + /// @brief Scan ns-uri-char[39]s starting at Cur. /// /// This updates Cur and Column while scanning. @@ -393,6 +410,11 @@ private: /// Pos is whitespace or a new line bool isBlankOrBreak(StringRef::iterator Position); + /// Consume a single b-break[28] if it's present at the current position. + /// + /// Return false if the code unit at the current position isn't a line break. + bool consumeLineBreakIfPresent(); + /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. void saveSimpleKeyCandidate( TokenQueueT::iterator Tok , unsigned AtColumn @@ -466,6 +488,30 @@ private: /// @brief Scan a block scalar starting with | or >. bool scanBlockScalar(bool IsLiteral); + /// Scan a chomping indicator in a block scalar header. + char scanBlockChompingIndicator(); + + /// Scan an indentation indicator in a block scalar header. + unsigned scanBlockIndentationIndicator(); + + /// Scan a block scalar header. + /// + /// Return false if an error occurred. + bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, + bool &IsDone); + + /// Look for the indentation level of a block scalar. + /// + /// Return false if an error occurred. + bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, + unsigned &LineBreaks, bool &IsDone); + + /// Scan the indentation of a text line in a block scalar. + /// + /// Return false if an error occurred. + bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, + bool &IsDone); + /// @brief Scan a tag of the form !stuff. bool scanTag(); @@ -612,6 +658,9 @@ bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { case Token::TK_Scalar: OS << "Scalar: "; break; + case Token::TK_BlockScalar: + OS << "Block Scalar: "; + break; case Token::TK_Alias: OS << "Alias: "; break; @@ -816,6 +865,13 @@ StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { return Position; } +StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { + if (Position == End) + return Position; + if (*Position == ' ') + return Position + 1; + return Position; +} StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { if (Position == End) @@ -844,6 +900,12 @@ StringRef::iterator Scanner::skip_while( SkipWhileFunc Func return Position; } +void Scanner::advanceWhile(SkipWhileFunc Func) { + auto Final = skip_while(Func, Current); + Column += Final - Current; + Current = Final; +} + static bool is_ns_hex_digit(const char C) { return (C >= '0' && C <= '9') || (C >= 'a' && C <= 'z') @@ -906,6 +968,16 @@ bool Scanner::isBlankOrBreak(StringRef::iterator Position) { return false; } +bool Scanner::consumeLineBreakIfPresent() { + auto Next = skip_b_break(Current); + if (Next == Current) + return false; + Column = 0; + ++Line; + Current = Next; + return true; +} + void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok , unsigned AtColumn , bool IsRequired) { @@ -1374,38 +1446,204 @@ bool Scanner::scanAliasOrAnchor(bool IsAlias) { return true; } -bool Scanner::scanBlockScalar(bool IsLiteral) { - StringRef::iterator Start = Current; - skip(1); // Eat | or > - while(true) { - StringRef::iterator i = skip_nb_char(Current); - if (i == Current) { - if (Column == 0) - break; - i = skip_b_break(Current); - if (i != Current) { - // We got a line break. - Column = 0; - ++Line; - Current = i; - continue; - } else { - // There was an error, which should already have been printed out. +char Scanner::scanBlockChompingIndicator() { + char Indicator = ' '; + if (Current != End && (*Current == '+' || *Current == '-')) { + Indicator = *Current; + skip(1); + } + return Indicator; +} + +/// Get the number of line breaks after chomping. +/// +/// Return the number of trailing line breaks to emit, depending on +/// \p ChompingIndicator. +static unsigned getChompedLineBreaks(char ChompingIndicator, + unsigned LineBreaks, StringRef Str) { + if (ChompingIndicator == '-') // Strip all line breaks. + return 0; + if (ChompingIndicator == '+') // Keep all line breaks. + return LineBreaks; + // Clip trailing lines. + return Str.empty() ? 0 : 1; +} + +unsigned Scanner::scanBlockIndentationIndicator() { + unsigned Indent = 0; + if (Current != End && (*Current >= '1' && *Current <= '9')) { + Indent = unsigned(*Current - '0'); + skip(1); + } + return Indent; +} + +bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, + unsigned &IndentIndicator, bool &IsDone) { + auto Start = Current; + + ChompingIndicator = scanBlockChompingIndicator(); + IndentIndicator = scanBlockIndentationIndicator(); + // Check for the chomping indicator once again. + if (ChompingIndicator == ' ') + ChompingIndicator = scanBlockChompingIndicator(); + Current = skip_while(&Scanner::skip_s_white, Current); + skipComment(); + + if (Current == End) { // EOF, we have an empty scalar. + Token T; + T.Kind = Token::TK_BlockScalar; + T.Range = StringRef(Start, Current - Start); + TokenQueue.push_back(T); + IsDone = true; + return true; + } + + if (!consumeLineBreakIfPresent()) { + setError("Expected a line break after block scalar header", Current); + return false; + } + return true; +} + +bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, + unsigned BlockExitIndent, + unsigned &LineBreaks, bool &IsDone) { + unsigned MaxAllSpaceLineCharacters = 0; + StringRef::iterator LongestAllSpaceLine; + + while (true) { + advanceWhile(&Scanner::skip_s_space); + if (skip_nb_char(Current) != Current) { + // This line isn't empty, so try and find the indentation. + if (Column <= BlockExitIndent) { // End of the block literal. + IsDone = true; + return true; + } + // We found the block's indentation. + BlockIndent = Column; + if (MaxAllSpaceLineCharacters > BlockIndent) { + setError( + "Leading all-spaces line must be smaller than the block indent", + LongestAllSpaceLine); return false; } + return true; } - Current = i; + if (skip_b_break(Current) != Current && + Column > MaxAllSpaceLineCharacters) { + // Record the longest all-space line in case it's longer than the + // discovered block indent. + MaxAllSpaceLineCharacters = Column; + LongestAllSpaceLine = Current; + } + + // Check for EOF. + if (Current == End) { + IsDone = true; + return true; + } + + if (!consumeLineBreakIfPresent()) { + IsDone = true; + return true; + } + ++LineBreaks; + } + return true; +} + +bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, + unsigned BlockExitIndent, bool &IsDone) { + // Skip the indentation. + while (Column < BlockIndent) { + auto I = skip_s_space(Current); + if (I == Current) + break; + Current = I; ++Column; } - if (Start == Current) { - setError("Got empty block scalar", Start); + if (skip_nb_char(Current) == Current) + return true; + + if (Column <= BlockExitIndent) { // End of the block literal. + IsDone = true; + return true; + } + + if (Column < BlockIndent) { + if (Current != End && *Current == '#') { // Trailing comment. + IsDone = true; + return true; + } + setError("A text line is less indented than the block scalar", Current); return false; } + return true; // A normal text line. +} + +bool Scanner::scanBlockScalar(bool IsLiteral) { + // Eat '|' or '>' + assert(*Current == '|' || *Current == '>'); + skip(1); + + char ChompingIndicator; + unsigned BlockIndent; + bool IsDone = false; + if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone)) + return false; + if (IsDone) + return true; + + auto Start = Current; + unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; + unsigned LineBreaks = 0; + if (BlockIndent == 0) { + if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, + IsDone)) + return false; + } + + // Scan the block's scalars body. + SmallString<256> Str; + while (!IsDone) { + if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) + return false; + if (IsDone) + break; + + // Parse the current line. + auto LineStart = Current; + advanceWhile(&Scanner::skip_nb_char); + if (LineStart != Current) { + Str.append(LineBreaks, '\n'); + Str.append(StringRef(LineStart, Current - LineStart)); + LineBreaks = 0; + } + + // Check for EOF. + if (Current == End) + break; + + if (!consumeLineBreakIfPresent()) + break; + ++LineBreaks; + } + + if (Current == End && !LineBreaks) + // Ensure that there is at least one line break before the end of file. + LineBreaks = 1; + Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); + + // New lines may start a simple key. + if (!FlowLevel) + IsSimpleKeyAllowed = true; Token T; - T.Kind = Token::TK_Scalar; + T.Kind = Token::TK_BlockScalar; T.Range = StringRef(Start, Current - Start); + T.Value = Str.str().str(); TokenQueue.push_back(T); return true; } @@ -1607,6 +1845,7 @@ std::string Node::getVerbatimTag() const { case NK_Null: return "tag:yaml.org,2002:null"; case NK_Scalar: + case NK_BlockScalar: // TODO: Tag resolution. return "tag:yaml.org,2002:str"; case NK_Mapping: @@ -2138,6 +2377,11 @@ parse_property: , AnchorInfo.Range.substr(1) , TagInfo.Range , T.Range); + case Token::TK_BlockScalar: + getNext(); + return new (NodeAllocator) + BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), + TagInfo.Range, T.Value, T.Range); case Token::TK_Key: // Don't eat the TK_Key, KeyValueNode expects it. return new (NodeAllocator) diff --git a/test/YAMLParser/spec-09-14.test b/test/YAMLParser/spec-09-14.test index 55d881de433..5f028f920fc 100644 --- a/test/YAMLParser/spec-09-14.test +++ b/test/YAMLParser/spec-09-14.test @@ -1,9 +1,6 @@ -# RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s +# RUN: not yaml-bench -canonical %s 2>&1 | FileCheck %s # -# FIXME: This test should actually fail. Yaml bench should report an error that -# says that the '---' and '...' document start/end markers must not be specified -# as the first content line of a non-indented plain scalar. -# CHECK: !!str +# CHECK: error: Expected a line break after block scalar header --- --- ||| : foo diff --git a/test/YAMLParser/spec-09-18.test b/test/YAMLParser/spec-09-18.test index ac623f9973f..cb05bb3774c 100644 --- a/test/YAMLParser/spec-09-18.test +++ b/test/YAMLParser/spec-09-18.test @@ -1,4 +1,8 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "literal\n" +# CHECK: !!str " folded\n" +# CHECK: !!str "keep\n\n" +# CHECK: !!str " strip" - | # Just the style literal diff --git a/test/YAMLParser/spec-09-19.test b/test/YAMLParser/spec-09-19.test index 52aa157137b..f385717b035 100644 --- a/test/YAMLParser/spec-09-19.test +++ b/test/YAMLParser/spec-09-19.test @@ -1,4 +1,6 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "literal\n" +# CHECK: !!str "folded\n" - | literal diff --git a/test/YAMLParser/spec-09-20.test b/test/YAMLParser/spec-09-20.test index 86fc7ab9a2e..47c255b8573 100644 --- a/test/YAMLParser/spec-09-20.test +++ b/test/YAMLParser/spec-09-20.test @@ -1,4 +1,8 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "detected\n" +# CHECK: !!str "\n\n# detected\n" +# CHECK: !!str " explicit\n" +# CHECK: !!str "\t\ndetected\n" - | detected diff --git a/test/YAMLParser/spec-09-21.test b/test/YAMLParser/spec-09-21.test index 76bc7d6bd31..661f986917a 100644 --- a/test/YAMLParser/spec-09-21.test +++ b/test/YAMLParser/spec-09-21.test @@ -9,4 +9,4 @@ - |1 text -# CHECK: error +# CHECK: 8:2: error: A text line is less indented than the block scalar diff --git a/test/YAMLParser/spec-09-22.test b/test/YAMLParser/spec-09-22.test index b95faa50b5d..726358dd5ba 100644 --- a/test/YAMLParser/spec-09-22.test +++ b/test/YAMLParser/spec-09-22.test @@ -1,6 +1,12 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "text" +# CHECK: !!str "text\n" +# CHECK: !!str "text\n\n" strip: |- - text
clip: | - text…keep: |+ - text
 + text +clip: | + text +keep: |+ + text + diff --git a/test/YAMLParser/spec-09-24.test b/test/YAMLParser/spec-09-24.test index f08eae6a80e..9cf51413855 100644 --- a/test/YAMLParser/spec-09-24.test +++ b/test/YAMLParser/spec-09-24.test @@ -1,8 +1,13 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: ? !!str "strip" +# CHECK: : !!str "" +# CHECK: ? !!str "clip" +# CHECK: : !!str "" +# CHECK: ? !!str "keep" +# CHECK: : !!str "\n" strip: >- clip: > keep: |+ - diff --git a/test/YAMLParser/spec-09-25.test b/test/YAMLParser/spec-09-25.test index b15edb523d2..697b47e9ac7 100644 --- a/test/YAMLParser/spec-09-25.test +++ b/test/YAMLParser/spec-09-25.test @@ -1,4 +1,5 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "literal\n\ttext\n" | # Simple block scalar literal diff --git a/test/YAMLParser/spec-09-26.test b/test/YAMLParser/spec-09-26.test index 286740ed39c..c8f31aea2b3 100644 --- a/test/YAMLParser/spec-09-26.test +++ b/test/YAMLParser/spec-09-26.test @@ -1,4 +1,5 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "\n\nliteral\n\ntext\n" | diff --git a/test/YAMLParser/spec-09-27.test b/test/YAMLParser/spec-09-27.test deleted file mode 100644 index 286740ed39c..00000000000 --- a/test/YAMLParser/spec-09-27.test +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: yaml-bench -canonical %s - -| - - - literal - - text - - # Comment diff --git a/test/YAMLParser/spec-09-28.test b/test/YAMLParser/spec-09-28.test deleted file mode 100644 index 286740ed39c..00000000000 --- a/test/YAMLParser/spec-09-28.test +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: yaml-bench -canonical %s - -| - - - literal - - text - - # Comment diff --git a/unittests/Support/YAMLParserTest.cpp b/unittests/Support/YAMLParserTest.cpp index 918c2059ea6..d3ee8afeb2d 100644 --- a/unittests/Support/YAMLParserTest.cpp +++ b/unittests/Support/YAMLParserTest.cpp @@ -130,6 +130,33 @@ TEST(YAMLParser, ParsesArrayOfArrays) { ExpectParseSuccess("Array of arrays", "[[]]"); } +TEST(YAMLParser, ParsesBlockLiteralScalars) { + ExpectParseSuccess("Block literal scalar", "test: |\n Hello\n World\n"); + ExpectParseSuccess("Block literal scalar EOF", "test: |\n Hello\n World"); + ExpectParseSuccess("Empty block literal scalar header EOF", "test: | "); + ExpectParseSuccess("Empty block literal scalar", "test: |\ntest2: 20"); + ExpectParseSuccess("Empty block literal scalar 2", "- | \n \n\n \n- 42"); + ExpectParseSuccess("Block literal scalar in sequence", + "- |\n Testing\n Out\n\n- 22"); + ExpectParseSuccess("Block literal scalar in document", + "--- |\n Document\n..."); + ExpectParseSuccess("Empty non indented lines still count", + "- |\n First line\n \n\n Another line\n\n- 2"); + ExpectParseSuccess("Comment in block literal scalar header", + "test: | # Comment \n No Comment\ntest 2: | # Void"); + ExpectParseSuccess("Chomping indicators in block literal scalar header", + "test: |- \n Hello\n\ntest 2: |+ \n\n World\n\n\n"); + ExpectParseSuccess("Indent indicators in block literal scalar header", + "test: |1 \n \n Hello \n World\n"); + ExpectParseSuccess("Chomping and indent indicators in block literals", + "test: |-1\n Hello\ntest 2: |9+\n World"); + ExpectParseSuccess("Trailing comments in block literals", + "test: |\n Content\n # Trailing\n #Comment\ntest 2: 3"); + ExpectParseError("Invalid block scalar header", "test: | failure"); + ExpectParseError("Invalid line indentation", "test: |\n First line\n Error"); + ExpectParseError("Long leading space line", "test: |\n \n Test\n"); +} + TEST(YAMLParser, HandlesEndOfFileGracefully) { ExpectParseError("In string starting with EOF", "[\""); ExpectParseError("In string hitting EOF", "[\" "); diff --git a/utils/yaml-bench/YAMLBench.cpp b/utils/yaml-bench/YAMLBench.cpp index bd5aa152dff..634622a710c 100644 --- a/utils/yaml-bench/YAMLBench.cpp +++ b/utils/yaml-bench/YAMLBench.cpp @@ -96,6 +96,8 @@ static void dumpNode( yaml::Node *n SmallString<32> Storage; StringRef Val = sn->getValue(Storage); outs() << prettyTag(n) << " \"" << yaml::escape(Val) << "\""; + } else if (yaml::BlockScalarNode *BN = dyn_cast(n)) { + outs() << prettyTag(n) << " \"" << yaml::escape(BN->getValue()) << "\""; } else if (yaml::SequenceNode *sn = dyn_cast(n)) { outs() << prettyTag(n) << " [\n"; ++Indent; -- 2.34.1