#!/usr/bin/python # -*- coding: utf-8 -*- # David Trethewey 28-03-2015 Open Source GPL # # A rough and ready hacked together (to a large extent on the train back from # Aberystwyth to Truro) segmentation of Cornish (Kernewek Kemmyn) text # to the syllable level using regular expressions. # # Some code comments are in Cornish, some non-existent. # I will make a better tidied up version and put it on a version-controlled # repository, dreckly. # Also I will try to make one for the Standard Written Form of Cornish # with an idea to use this to do transliteration between the orthographies. # # Usage: python sylabelenn_ranna_kw_versyon_001.py # where is the path to an input file containing # text in Kernewek Kemmyn import nltk import sys import string import re import copy class RannaSyllabelenn: def __init__(self, inputtext): # possybl yw: KB, KBK, B # A really difficult to read regular expression # to match consonant-vowel, conson-vowel-conson or vowel alone # as syllables self.kynsaRegexp = r'(^((bl|br|Bl|Br|kl|Kl|kr|Kr|kn|Kn|kw|Kw|ch|Ch|Dhr?|dhr?|dl|dr|Dl|Dr|fl|Fl|fr?|Fr?|vl|Vl|vr|Vr|vv?|Vv?|gwr?|gwl?|gl|gr|gn?|Gwr?|Gwl?|Gl|Gr|Gn?|hw?|Hw?|pr|pl?|Pr|Pl?|shr?|Shr?|str?|Str?|skr?|Skr?|sbr|Sbr|sp?l?|Sp?l?|thr?|Thr?|tr|Tr|tl|Tl|wr|Wr|wl|Wl|[bkdjlmnrtwyBKDJLMNRTVWY])(ay|aw|eu|ey|ew|iw|oe|oy|ow|ou|uw|yw|[aeoiuy])(lgh|bl|br|bb?|kl|kr|kn|kw|kk?|ch|dhr?|dl|dr|dd?|fl|fr?|ff?|vl|vv?|gg?h|gw|gl|gn?|ll?|mm?|nd|ns|nn?|pr|pl?|pp?|rgh?|rdh?|rth?|rv|rn|rr?|sh|str?|skr?|spr|sp|ss?|th|tl|tt?|[jw])?))|(^((ay|aw|eu|ew|ey|iw|oe|oy|ow|ou|uw|yw|Ay|Aw|Ey|Eu|Ew|Iw|Oe|Oy|Ow|Ou|Uw|Yw|[aeoiuyAEIOUY]))(lgh|bl|bb?|kl|kr|kn|kw|kk?|ch|dhr?|dl|dr|dd?|fl|fr?|ff?|vl|vv?|gg?h|gw|gl|gn?|ll?|mm?|nd|ns|nn?|pr|pl?|pp?|rgh?|rdh?|rth?|rv|rn|rr?|sh|str?|skr?|spr|sp|ss?|th|tl|tt?|[jw])?)|(\-)(.*?)' # some other regular expressions not used at present # these need to be debugged before use #self.dewson_sevel_re = r'ya|ye|yo|yu|wa|we|wi|wo|wy' #self.dewson_kodha_re = r'ay|oe|oy|ey|aw|ew|iw|ow|uw|yw' #self.pennvog_re = r'^(.*?)(ay|aw|ey|eu|ew|iw|oe|oy|ow|ou|uw|yw|Ay|Aw|Ey|Eu|Ew|Iw|Oe|Oy|Ou|Ow|Uw|Yw|[aeoiuyAEIOUY])$' #self.lostkess_re = r'^(.*?)(ch|Ch|gh|Gh|dh|Dh|f|F|gw|Gw|ll?|th|Th|[bkdfghjlmnprstvwBKDFGHJLMNPRSTVW])$' #self.lostKB_re = r'(.*?)(ch|Ch|gh|Gh|dh|Dh|f|F|gw|Gw|ll?|th|Th|[bkdfghjlmnprstvwBKDFGHJLMNPRSTVW])(ay|aw|ey|eu|ew|iw|oe|oy|ow|ou|uw|yw|Ay|Aw|Ey|Eu|Ew|Iw|Oe|Oy|Ow|Ou|Uw|Yw|[aeoiuyAEIOUY])$' #self.lostBK_re = r'(.*?)(ay|aw|ey|eu|ew|iw|oe|oy|ow|ou|uw|yw|Ay|Aw|Ey|Eu|Ew|Iw|Oe|Oy|Ow|Ou|Uw|Yw|[aeoiuyAEIOUY])(ch|gh|Gh|dh|Dh|f|F|gw|Gw|ll?|th|Th|[bkdfghjlmnprstvwBKDFGHJLMNPRSTVW])$' # print inputtext # use NLTK to split input text into words. self.geryow = nltk.word_tokenize(inputtext) # print self.geryow def ranna_syl(self,ger): # divide a word into a list of its syllables # and return this syl_list = [] while ger: # print ger k = kynsa_syl(ger) syl_list.append(k) if len(ger.split(k,1))>1: ger = ger.split(k,1)[1] else: ger = '' return syl_list def diwettha_syl(self,ger, regexp): # find last syllable of a word # not used at present diwettha_syl = '' # po kessonenn ha bogalenn po bogalenn ha kessonenn # An diwettha sylabelenn yw po -KB po -BK dsyl = re.findall(regexp,ger) # print dsyl if not(dsyl == []): diwettha_syl=dsyl[0][1] return diwettha_syl def kynsa_syl(self, ger, regexp): # find 1st syllable of word kynsa_syl = '' # print ger # take off an initial hyphen from the syllable if ger[0] == '-': ger = ger[1:] # print ger ksyl = re.findall(regexp,ger) # print "An kynsa sylabellen yw:", ksyl if not(ksyl == []): kynsa_syl = ksyl[0][1]+ksyl[0][5] return kynsa_syl def diwettha_lytherenn(self,ger): # return last letter of a word d_l = '' if ger[-1].isalpha(): d_l = ger[-1] return d_l class Ger: # class for a word of Cornish text def __init__(self,ger): self.ger = ger # an ger kowal # dilea an dashow - # self.ger = self.ger.replace("-","") # dilea an . ; , ? # strip out punctuation characters self.ger = self.ger.replace(".","") self.ger = self.ger.replace(";","") self.ger = self.ger.replace(",","") self.ger = self.ger.replace("?","") self.ger = self.ger.replace("'","") self.ger = self.ger.replace(" ","") # print ger self.n_sls = 0 # niver sylabelennow self.sls = [] # rol a sylabelennow gergesys = self.ger # rann an ger yw gesys while gergesys != '': k = rannans.kynsa_syl(gergesys,rannans.kynsaRegexp) self.sls.append(copy.copy(k)) if (len(k)>0) and (len(gergesys.split(k,1))>1): gergesys = gergesys.split(k,1)[1] else: gergesys = '' self.n_sls = self.n_sls + 1 def diskwedh(self): # show output for each word print "An ger yw:",self.ger print "Niver a sylabelennow yw:",self.n_sls print "Hag yns i:" print self.sls for i in range(self.n_sls): print i+1, ":",self.sls[i] if __name__ == '__main__': # take the input from a file specified # by a command line argument inputfile = sys.argv[1] inputtext = file(inputfile).read() rannans = RannaSyllabelenn(inputtext) # general test code not used in this version # rannans.profya(rannans.geryow) for i in rannans.geryow: g = Ger(i) if g.ger != '': g.diskwedh() print('\n')