Skip to content

Frog Chunker creates invalid FoLiA #100

@kosloot

Description

@kosloot

Given this FoLiA file F1:

<?xml version="1.0" encoding="UTF-8"?>
<FoLiA xmlns="http://ilk.uvt.nl/folia" xmlns:xlink="http://www.w3.org/1999/xlink" xml:id="issue100" version="2.5">
  <metadata>
    <annotations>
      <text-annotation set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/text.foliaset.ttl"/>
      <division-annotation/>
      <sentence-annotation/>
    </annotations>
  </metadata>
  <text xml:id="text">
    <div xml:id="div">
      <s xml:id="s">
	<t>Een Bug? Nee toch?</t>
      </s>
    </div>
  </text>
</FoLiA>

Running frog on this file gives:

$ frog --skip=pmnla -x F1 -X F2

File F2:

<?xml version="1.0" encoding="UTF-8"?>
<FoLiA xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://ilk.uvt.nl/folia" xml:id="issue100" generator="libfolia-v2.15" version="2.5">
  <metadata type="native">
    <annotations>
      <text-annotation set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/text.foliaset.ttl"/>
      <division-annotation/>
      <sentence-annotation/>
      <token-annotation alias="tokconfig-nld" set="https://raw.githubusercontent.com/LanguageMachines/uctodata/master/setdefinitions/tokconfig-nld.foliaset.ttl">
        <annotator processor="ucto.1"/>
      </token-annotation>
      <paragraph-annotation>
        <annotator processor="ucto.1"/>
      </paragraph-annotation>
      <quote-annotation>
        <annotator processor="ucto.1"/>
      </quote-annotation>
      <pos-annotation set="http://ilk.uvt.nl/folia/sets/frog-mbpos-cgn">
        <annotator processor="tagger.1"/>
      </pos-annotation>
      <chunking-annotation set="http://ilk.uvt.nl/folia/sets/frog-chunker-nl">
        <annotator processor="IOB.1"/>
      </chunking-annotation>
    </annotations>
    <provenance>
      <processor xml:id="frog.1" begindatetime="2023-05-02T08:40:42" command="frog --skip=pmnla -x F1 -X F2" folia_version="2.5.1" host="kobus" name="frog" user="sloot" version="0.29">
        <processor xml:id="frog.1.generator" folia_version="2.5.1" name="libfolia" type="generator" version="2.15"/>
        <processor xml:id="ucto.1" begindatetime="2023-05-02T08:40:42" name="ucto" version="0.30">
          <processor xml:id="uctodata.1" name="uctodata" type="datasource" version="0.9.1">
            <processor xml:id="uctodata.1.1" name="tokconfig-nld" type="datasource" version="0.2"/>
          </processor>
        </processor>
        <processor xml:id="tagger.1" begindatetime="2023-05-02T08:40:42" name="tagger" version="2.0"/>
        <processor xml:id="IOB.1" begindatetime="2023-05-02T08:40:42" name="IOB" version="2.0"/>
      </processor>
    </provenance>
    <meta id="language">nld</meta>
  </metadata>
  <text xml:id="text">
    <div xml:id="div">
      <s xml:id="s">
        <t>Een Bug? Nee toch?</t>
        <w xml:id="s.w.1" class="WORD">
          <t>Een</t>
          <pos class="LID(onbep,stan,agr)" confidence="0.981771" head="LID">
            <feat class="onbep" subset="lwtype"/>
            <feat class="stan" subset="naamval"/>
            <feat class="agr" subset="npagr"/>
          </pos>
        </w>
        <w xml:id="s.w.2" class="WORD" space="no">
          <t>Bug</t>
          <pos class="SPEC(vreemd)" confidence="1.0" head="SPEC">
            <feat class="vreemd" subset="spectype"/>
          </pos>
        </w>
        <w xml:id="s.w.3" class="PUNCTUATION">
          <t>?</t>
          <pos class="LET()" confidence="1.0" head="LET"/>
        </w>
        <chunking xml:id="s.chunking.1">
          <chunk xml:id="s.chunking.1.chunk.1" class="NP" confidence="1.0">
            <wref id="s.w.1" t="Een"/>
            <wref id="s.w.2" t="Bug"/>
          </chunk>
          <chunk xml:id="s.chunking.1.chunk.2" class="TSW" confidence="1.0">
            <wref id="s.w.4" t="Nee"/>
          </chunk>
          <chunk xml:id="s.chunking.1.chunk.3" class="ADVP" confidence="1.0">
            <wref id="s.w.5" t="toch"/>
          </chunk>
        </chunking>
        <w xml:id="s.w.4" class="WORD">
          <t>Nee</t>
          <pos class="TSW()" confidence="0.978799" head="TSW"/>
        </w>
        <w xml:id="s.w.5" class="WORD" space="no">
          <t>toch</t>
          <pos class="BW()" confidence="0.998829" head="BW"/>
        </w>
        <w xml:id="s.w.6" class="PUNCTUATION">
          <t>?</t>
          <pos class="LET()" confidence="1.0" head="LET"/>
        </w>
      </s>
    </div>
  </text>
</FoLiA>

This is INVALID!
The chunking information is interleaved with the words in the sentence, leading to forward references to s.w.4 and s.w.5: the <chunking> element refers to words that are only defined after it.
This is NOT supported:

folialint:
F2 failed: XML error: Unresolvable id s.w.4 in WordReference

foliavalidator:

VALIDATION ERROR on full parse by library (stage 2/3), in F2
ParseError: FoLiA exception in handling of <wref> @ line 66 (in parent <chunk> @ parent line 65) : [InvalidReference] s.w.4

Metadata

Metadata

Assignees

Labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions