Skip to content

Commit 183da4e

Browse files
committed
Partially cleaned up the code of Czech FixEdeprels.
1 parent dae8dd9 commit 183da4e

File tree

1 file changed

+98
-33
lines changed

1 file changed

+98
-33
lines changed

udapi/block/ud/cs/fixedeprels.py

Lines changed: 98 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,30 @@ def copy_case_from_adposition(self, node, adposition):
338338
else:
339339
return None
340340

341+
@staticmethod
342+
def compose_edeprel(bdeprel, cdeprel):
343+
"""
344+
Composes enhanced deprel from the basic part and optional case
345+
enhancement.
346+
347+
Parameters
348+
----------
349+
bdeprel : str
350+
Basic deprel (can include subtype, e.g., 'acl:relcl').
351+
cdeprel : TYPE
352+
Case enhancement (can be composed of adposition and morphological
353+
case, e.g., 'k:dat'). It is optional and it can be None or empty
354+
string if there is no case enhancement.
355+
356+
Returns
357+
-------
358+
Full enhanced deprel (str).
359+
"""
360+
edeprel = bdeprel
361+
if cdeprel:
362+
edeprel += ':'+cdeprel
363+
return edeprel
364+
341365
def process_tree(self, tree):
342366
"""
343367
Occasionally the edeprels automatically derived from the Czech basic
@@ -348,64 +372,105 @@ def process_tree(self, tree):
348372
"""
349373
for node in tree.descendants_and_empty:
350374
for edep in node.deps:
351-
m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel'])
375+
m = re.fullmatch(r'(obl(?::arg)?|nmod|advcl(?::pred)?|acl(?::relcl)?):(.+)', edep['deprel'])
352376
if m:
377+
bdeprel = m.group(1)
378+
cdeprel = m.group(2)
353379
solved = False
354380
# Issues caused by errors in the original annotation must be fixed early.
355381
# Especially if acl|advcl occurs with a preposition that unambiguously
356382
# receives a morphological case in the subsequent steps, and then gets
357383
# flagged as solved.
358-
edep['deprel'] = re.sub(r'^advcl:do(?::gen)?$', r'obl:do:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu!
359-
edep['deprel'] = re.sub(r'^advcl:pro(?::acc)?$', r'advcl:aby', edep['deprel']) # byl by pro, abychom... ###!!! Opravit i konverzi stromu.
360-
edep['deprel'] = re.sub(r'^advcl:s(?::ins)?$', r'advcl', edep['deprel']) ###!!! "seděli jsme tam s Člověče, nezlob se!" Měla by se opravit konverze stromu.
361-
edep['deprel'] = re.sub(r'^acl:k(?::dat)?$', r'acl', edep['deprel'])
362-
edep['deprel'] = re.sub(r'^advcl:k(?::dat)?$', r'obl:k:dat', edep['deprel']) ###!!! Ale měli bychom opravit i závislost v základním stromu!
363-
edep['deprel'] = re.sub(r'^advcl:místo(?::gen)?$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych'
364-
edep['deprel'] = re.sub(r'^acl:na_způsob(?::gen)?$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"'
365-
edep['deprel'] = re.sub(r'^acl:od(?::gen)?$', r'nmod:od:gen', edep['deprel'])
366-
edep['deprel'] = re.sub(r'^advcl:od(?::gen)?$', r'obl:od:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu!
367-
edep['deprel'] = re.sub(r'^advcl:podle(?::gen)?$', r'obl:podle:gen', edep['deprel'])
368-
edep['deprel'] = re.sub(r'^advcl:pro(?::acc)?$', r'obl:pro:acc', edep['deprel'])
369-
edep['deprel'] = re.sub(r'^acl:v$', r'nmod:v:loc', edep['deprel'])
370-
edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel'])
371-
edep['deprel'] = re.sub(r'^advcl:v_duchu?(?::gen)?$', r'obl:v_duchu:gen', edep['deprel'])
372-
edep['deprel'] = re.sub(r'^nmod:když.*$', r'nmod', edep['deprel']) # nadějí když ne na zbohatnutí, tak alespoň na dobrou obživu ###!!! perhaps "když" or "když ne" should be analyzed as "cc" here!
373-
edep['deprel'] = re.sub(r'^obl:ačkoli.*$', r'obl', edep['deprel']) # nadějí když ne na zbohatnutí, tak alespoň na dobrou obživu ###!!! perhaps "když" or "když ne" should be analyzed as "cc" here!
374-
edep['deprel'] = re.sub(r'^obl:jestli(?::gen)?$', r'obl:gen', edep['deprel']) # nevím, jestli osmého nebo devátého září
375-
# Removing 'až' must be done early. The remainder may be 'počátek'
376-
# and we will want to convert it to 'počátkem:gen'.
377-
edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel'])
384+
if re.match(r'advcl', bdeprel):
385+
# The following advcl should in fact be obl.
386+
if re.fullmatch(r'do(?::gen)?', cdeprel): # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu!
387+
bdeprel = 'obl'
388+
cdeprel = 'do:gen'
389+
elif re.fullmatch(r'k(?::dat)?', cdeprel): ###!!! Ale měli bychom opravit i závislost v základním stromu!
390+
bdeprel = 'obl'
391+
cdeprel = 'k:dat'
392+
elif re.fullmatch(r'místo(?::gen)?', cdeprel): # 'v poslední době se množí bysem místo bych'
393+
bdeprel = 'obl'
394+
cdeprel = 'místo:gen'
395+
elif re.fullmatch(r'od(?::gen)?', cdeprel): # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu!
396+
bdeprel = 'obl'
397+
cdeprel = 'od:gen'
398+
elif re.fullmatch(r'podle(?::gen)?', cdeprel):
399+
bdeprel = 'obl'
400+
cdeprel = 'podle:gen'
401+
elif re.fullmatch(r's(?::ins)?', cdeprel): ###!!! "seděli jsme tam s Člověče, nezlob se!" Měla by se opravit konverze stromu.
402+
bdeprel = 'obl'
403+
cdeprel = 's:ins'
404+
elif re.fullmatch(r'v_duchu?(?::gen)?', cdeprel):
405+
bdeprel = 'obl'
406+
cdeprel = 'v_duchu:gen'
407+
elif re.fullmatch(r'v', cdeprel):
408+
bdeprel = 'obl'
409+
cdeprel = 'v:loc'
410+
# byl by pro, abychom... ###!!! Opravit i konverzi stromu.
411+
elif re.fullmatch(r'pro(?::acc)?', cdeprel):
412+
cdeprel = 'aby'
413+
elif re.match(r'acl', bdeprel):
414+
# The following acl should in fact be nmod.
415+
if re.fullmatch(r'k(?::dat)?', cdeprel):
416+
bdeprel = 'nmod'
417+
cdeprel = 'k:dat'
418+
elif re.fullmatch(r'na_způsob(?::gen)?', cdeprel): # 'střídmost na způsob Masarykova "jez dopolosyta"'
419+
bdeprel = 'nmod'
420+
cdeprel = 'na_způsob:gen'
421+
elif re.fullmatch(r'od(?::gen)?', cdeprel):
422+
bdeprel = 'nmod'
423+
cdeprel = 'od:gen'
424+
elif re.fullmatch(r'v', cdeprel):
425+
bdeprel = 'nmod'
426+
cdeprel = 'v:loc'
427+
else: # bdeprel is 'obl' or 'nmod'
428+
# The following subordinators should be removed if they occur with nominals.
429+
if re.match(r'(ačkoli|když)', cdeprel): # nadějí když ne na zbohatnutí, tak alespoň na dobrou obživu ###!!! perhaps "když" or "když ne" should be analyzed as "cc" here!
430+
cdeprel = ''
431+
# Removing 'až' must be done early. The remainder may be 'počátek'
432+
# and we will want to convert it to 'počátkem:gen'.
433+
elif re.match(r'až_(.+):(gen|dat|acc|loc|ins)', cdeprel):
434+
cdeprel = re.sub(r'až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2', cdeprel)
435+
elif re.fullmatch(r'jestli(?::gen)?', cdeprel): # nevím, jestli osmého nebo devátého září
436+
cdeprel = 'gen'
437+
edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel)
378438
# If one of the following expressions occurs followed by another preposition
379439
# or by morphological case, remove the additional case marking. For example,
380440
# 'jako_v' becomes just 'jako'.
381441
for x in self.outermost:
382442
exceptions = self.outermost[x]
383-
m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel'])
384-
if m and m.group(2) and not x+m.group(2) in exceptions:
385-
edep['deprel'] = m.group(1)+':'+x
443+
m = re.fullmatch(x+r'([_:].+)?', cdeprel)
444+
if m and m.group(1) and not x+m.group(1) in exceptions:
445+
cdeprel = x
446+
edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel)
386447
solved = True
387448
break
388449
if solved:
389450
continue
390451
for x in self.unambiguous:
391452
# All secondary prepositions have only one fixed morphological case
392453
# they appear with, so we can replace whatever case we encounter with the correct one.
393-
m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel'])
454+
m = re.fullmatch(x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?', cdeprel)
394455
if m:
395-
edep['deprel'] = m.group(1)+':'+self.unambiguous[x]
456+
cdeprel = self.unambiguous[x]
457+
edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel)
396458
solved = True
397459
break
398460
if solved:
399461
continue
400462
# The following prepositions have more than one morphological case
401463
# available. Thanks to the Case feature on prepositions, we can
402464
# identify the correct one.
403-
m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel'])
404-
if m:
405-
adpcase = self.copy_case_from_adposition(node, m.group(2))
406-
if adpcase and not re.search(r':(nom|gen|dat|voc)$', adpcase):
407-
edep['deprel'] = m.group(1)+':'+adpcase
408-
continue
465+
if re.match(r'(obl|nmod)', bdeprel):
466+
m = re.fullmatch(r'(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?', cdeprel)
467+
if m:
468+
adpcase = self.copy_case_from_adposition(node, m.group(1))
469+
if adpcase and not re.search(r':(nom|gen|dat|voc)$', adpcase):
470+
cdeprel = adpcase
471+
edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel)
472+
continue
473+
###!!! bdeprel and cdeprel are not visible from here on but we may want to use them there as well.
409474
if re.match(r'^(acl|advcl):', edep['deprel']):
410475
# We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations).
411476
edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel'])
@@ -427,7 +492,7 @@ def process_tree(self, tree):
427492
node.feats['VerbForm'] = ''
428493
node.feats['Voice'] = ''
429494
elif re.match(r'^(nmod|obl(:arg)?):', edep['deprel']):
430-
if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc':
495+
if edep['deprel'] == 'nmod:loc' and (node.parent == None or node.parent.feats['Case'] == 'Loc') or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc':
431496
# This is a same-case noun-noun modifier, which just happens to be in the locative.
432497
# For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has
433498
# nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant.

0 commit comments

Comments
 (0)