Package pyarabic :: Module araby
[hide private]
[frames] | no frames]

Source Code for Module pyarabic.araby

  1  #!/usr/bin/python 
  2  # -*- coding=utf-8 -*- 
  3  #--- 
  4  # 
  5  # ------------ 
  6  # Description: 
  7  # ------------ 
  8  # 
  9  # Arabic codes 
 10  # 
 11  # (C) Copyright 2010, Taha Zerrouki 
 12  # ----------------- 
 13  #  $Date: 2010/03/01 
 14  #  $Author: Taha Zerrouki$ 
 15  #  $Revision: 0.1 $ 
 16  #  This program is written under the Gnu Public License. 
 17  # 
 18  """ 
 19  Arabic module 
 20  @author: Taha Zerrouki 
 21  @contact: taha dot zerrouki at gmail dot com 
 22  @copyright: Arabtechies, Arabeyes,  Taha Zerrouki 
 23  @license: GPL 
 24  @date:2010/03/01 
 25  @version: 0.1 
 26  """ 
 27  import re 
 28  from stack import * 
 29  #class araby: 
 30  """ 
 31  the arabic chars contains all arabic letters, a sub class of unicode, 
 32  """ 
 33   
# Single-character constants: each name is bound to a one-character
# unicode string holding the code point written in the literal.

# Punctuation, the hamza/alef forms and the basic letters (U+060C - U+064A).
COMMA            = u'\u060C'
SEMICOLON        = u'\u061B'
QUESTION         = u'\u061F'
HAMZA            = u'\u0621'
ALEF_MADDA       = u'\u0622'
ALEF_HAMZA_ABOVE = u'\u0623'
WAW_HAMZA        = u'\u0624'
ALEF_HAMZA_BELOW = u'\u0625'
YEH_HAMZA        = u'\u0626'
ALEF             = u'\u0627'
BEH              = u'\u0628'
TEH_MARBUTA      = u'\u0629'
TEH              = u'\u062a'
THEH             = u'\u062b'
JEEM             = u'\u062c'
HAH              = u'\u062d'
KHAH             = u'\u062e'
DAL              = u'\u062f'
THAL             = u'\u0630'
REH              = u'\u0631'
ZAIN             = u'\u0632'
SEEN             = u'\u0633'
SHEEN            = u'\u0634'
SAD              = u'\u0635'
DAD              = u'\u0636'
TAH              = u'\u0637'
ZAH              = u'\u0638'
AIN              = u'\u0639'
GHAIN            = u'\u063a'
TATWEEL          = u'\u0640'
FEH              = u'\u0641'
QAF              = u'\u0642'
KAF              = u'\u0643'
LAM              = u'\u0644'
MEEM             = u'\u0645'
NOON             = u'\u0646'
HEH              = u'\u0647'
WAW              = u'\u0648'
ALEF_MAKSURA     = u'\u0649'
YEH              = u'\u064a'
# Combining madda / hamza marks.
MADDA_ABOVE      = u'\u0653'
HAMZA_ABOVE      = u'\u0654'
HAMZA_BELOW      = u'\u0655'
# Digits (U+0660 - U+0669) and numeric punctuation.
ZERO             = u'\u0660'
ONE              = u'\u0661'
TWO              = u'\u0662'
THREE            = u'\u0663'
FOUR             = u'\u0664'
FIVE             = u'\u0665'
SIX              = u'\u0666'
SEVEN            = u'\u0667'
EIGHT            = u'\u0668'
NINE             = u'\u0669'
PERCENT          = u'\u066a'
DECIMAL          = u'\u066b'
THOUSANDS        = u'\u066c'
STAR             = u'\u066d'
# MINI_ALEF has the same value (U+0670) as SMALL_ALEF defined below.
MINI_ALEF        = u'\u0670'
ALEF_WASLA       = u'\u0671'
FULL_STOP        = u'\u06d4'
BYTE_ORDER_MARK  = u'\ufeff'

# Diacritics (U+064B - U+0652): the three tanwin marks, the three short
# vowels, shadda and sukun.
FATHATAN         = u'\u064b'
DAMMATAN         = u'\u064c'
KASRATAN         = u'\u064d'
FATHA            = u'\u064e'
DAMMA            = u'\u064f'
KASRA            = u'\u0650'
SHADDA           = u'\u0651'
SUKUN            = u'\u0652'

# Small Letters
SMALL_ALEF      =u"\u0670"
SMALL_WAW       =u"\u06E5"
SMALL_YEH       =u"\u06E6"
# Ligatures: the LAM_ALEF* names are single presentation-form code points;
# the simple_* names are the corresponding two-character sequences
# (LAM followed by the matching alef form).
LAM_ALEF                    =u'\ufefb'
LAM_ALEF_HAMZA_ABOVE        =u'\ufef7'
LAM_ALEF_HAMZA_BELOW        =u'\ufef9'
LAM_ALEF_MADDA_ABOVE        =u'\ufef5'
simple_LAM_ALEF             =u'\u0644\u0627'
simple_LAM_ALEF_HAMZA_ABOVE =u'\u0644\u0623'
simple_LAM_ALEF_HAMZA_BELOW =u'\u0644\u0625'
simple_LAM_ALEF_MADDA_ABOVE =u'\u0644\u0622'
# Character groups.  LETTERS is a single unicode string; every other group
# is a tuple of one-character unicode strings, used for membership tests
# and to build the compiled character-class patterns.

# The basic letters followed by the hamza-carrying forms.
LETTERS = u''.join((
    ALEF, BEH, TEH, TEH_MARBUTA, THEH, JEEM, HAH, KHAH,
    DAL, THAL, REH, ZAIN, SEEN, SHEEN, SAD, DAD, TAH, ZAH,
    AIN, GHAIN, FEH, QAF, KAF, LAM, MEEM, NOON, HEH, WAW, YEH,
    HAMZA, ALEF_MADDA, ALEF_HAMZA_ABOVE, WAW_HAMZA, ALEF_HAMZA_BELOW,
    YEH_HAMZA,
))

# Every diacritic, SHADDA included.
TASHKEEL = (
    FATHATAN, DAMMATAN, KASRATAN,
    FATHA, DAMMA, KASRA,
    SUKUN,
    SHADDA,
)
# Same as TASHKEEL but without SHADDA.
HARAKAT = (
    FATHATAN, DAMMATAN, KASRATAN,
    FATHA, DAMMA, KASRA,
    SUKUN,
)
SHORTHARAKAT = (FATHA, DAMMA, KASRA, SUKUN)

TANWIN = (FATHATAN, DAMMATAN, KASRATAN)

# Single-code-point lam-alef ligatures.
LIGUATURES = (
    LAM_ALEF,
    LAM_ALEF_HAMZA_ABOVE,
    LAM_ALEF_HAMZA_BELOW,
    LAM_ALEF_MADDA_ABOVE,
)
# Every hamza form, including the two combining hamza marks.
HAMZAT = (
    HAMZA,
    WAW_HAMZA,
    YEH_HAMZA,
    HAMZA_ABOVE,
    HAMZA_BELOW,
    ALEF_HAMZA_BELOW,
    ALEF_HAMZA_ABOVE,
)
# Every alef form, including ALEF_MAKSURA and SMALL_ALEF.
ALEFAT = (
    ALEF,
    ALEF_MADDA,
    ALEF_HAMZA_ABOVE,
    ALEF_HAMZA_BELOW,
    ALEF_WASLA,
    ALEF_MAKSURA,
    SMALL_ALEF,
)
WEAK = (ALEF, WAW, YEH, ALEF_MAKSURA)
YEHLIKE = (YEH, YEH_HAMZA, ALEF_MAKSURA, SMALL_YEH)

WAWLIKE = (WAW, WAW_HAMZA, SMALL_WAW)
TEHLIKE = (TEH, TEH_MARBUTA)

SMALL = (SMALL_ALEF, SMALL_WAW, SMALL_YEH)
# Moon letters.
MOON = (
    HAMZA,
    ALEF_MADDA,
    ALEF_HAMZA_ABOVE,
    ALEF_HAMZA_BELOW,
    ALEF,
    BEH,
    JEEM,
    HAH,
    KHAH,
    AIN,
    GHAIN,
    FEH,
    QAF,
    KAF,
    MEEM,
    HEH,
    WAW,
    YEH,
)
# Sun letters.
SUN = (
    TEH,
    THEH,
    DAL,
    THAL,
    REH,
    ZAIN,
    SEEN,
    SHEEN,
    SAD,
    DAD,
    TAH,
    ZAH,
    LAM,
    NOON,
)
# Maps a letter to its rank in the alphabet, as returned by order().
# ALEF is 1 and YEH is 28; TEH_MARBUTA shares rank 3 with TEH, and HAMZA
# together with every hamza-carrying form (and ALEF_MADDA) shares rank 29.
AlphabeticOrder={
                ALEF             : 1,
                BEH              : 2,
                TEH              : 3,
                TEH_MARBUTA      : 3,
                THEH             : 4,
                JEEM             : 5,
                HAH              : 6,
                KHAH             : 7,
                DAL              : 8,
                THAL             : 9,
                REH              : 10,
                ZAIN             : 11,
                SEEN             : 12,
                SHEEN            : 13,
                SAD              : 14,
                DAD              : 15,
                TAH              : 16,
                ZAH              : 17,
                AIN              : 18,
                GHAIN            : 19,
                FEH              : 20,
                QAF              : 21,
                KAF              : 22,
                LAM              : 23,
                MEEM             : 24,
                NOON             : 25,
                HEH              : 26,
                WAW              : 27,
                YEH              : 28,
                HAMZA            : 29,

                # hamza-carrying forms all rank with HAMZA
                ALEF_MADDA       : 29,
                ALEF_HAMZA_ABOVE : 29,
                WAW_HAMZA        : 29,
                ALEF_HAMZA_BELOW : 29,
                YEH_HAMZA        : 29,
                }
# Maps a character to its name written in Arabic, as returned by name().
# Covers the letters, TATWEEL, the hamza/alef variants and the eight
# diacritics; characters not listed here have no entry.
NAMES ={
                ALEF             :  u"ألف",
                BEH              : u"باء",
                TEH              : u'تاء' ,
                TEH_MARBUTA      : u'تاء مربوطة' ,
                THEH             : u'ثاء' ,
                JEEM             : u'جيم' ,
                HAH              : u'حاء' ,
                KHAH             : u'خاء' ,
                DAL              : u'دال' ,
                THAL             : u'ذال' ,
                REH              : u'راء' ,
                ZAIN             : u'زاي' ,
                SEEN             : u'سين' ,
                SHEEN            : u'شين' ,
                SAD              : u'صاد' ,
                DAD              : u'ضاد' ,
                TAH              : u'طاء' ,
                ZAH              : u'ظاء' ,
                AIN              : u'عين' ,
                GHAIN            : u'غين' ,
                FEH              : u'فاء' ,
                QAF              : u'قاف' ,
                KAF              : u'كاف' ,
                LAM              : u'لام' ,
                MEEM             : u'ميم' ,
                NOON             : u'نون' ,
                HEH              : u'هاء' ,
                WAW              : u'واو' ,
                YEH              : u'ياء' ,
                HAMZA            : u'همزة' ,

                # modifier, alef/hamza variants and diacritics
                TATWEEL          : u'تطويل' ,
                ALEF_MADDA       : u'ألف ممدودة' ,
                ALEF_MAKSURA      : u'ألف مقصورة' ,
                ALEF_HAMZA_ABOVE : u'همزة على الألف' ,
                WAW_HAMZA        : u'همزة على الواو' ,
                ALEF_HAMZA_BELOW : u'همزة تحت الألف' ,
                YEH_HAMZA        : u'همزة على الياء' ,
                FATHATAN         : u'فتحتان',
                DAMMATAN         : u'ضمتان',
                KASRATAN         : u'كسرتان',
                FATHA            : u'فتحة',
                DAMMA            : u'ضمة',
                KASRA            : u'كسرة',
                SHADDA           : u'شدة',
                SUKUN            : u'سكون',
                }
293   
# Regular expressions: one compiled character class per character group.
# Plain u"" literals are used instead of the ur"" prefix: the brackets need
# no raw-string treatment, and ur"" is a syntax error on Python 3.
HARAKAT_pattern = re.compile(u"[" + u"".join(HARAKAT) + u"]")
TASHKEEL_pattern = re.compile(u"[" + u"".join(TASHKEEL) + u"]")
HAMZAT_pattern = re.compile(u"[" + u"".join(HAMZAT) + u"]")
ALEFAT_pattern = re.compile(u"[" + u"".join(ALEFAT) + u"]")
LIGUATURES_pattern = re.compile(u"[" + u"".join(LIGUATURES) + u"]")
300   
301  ################################################ 
302  #{ is letter functions 
303  ################################################ 
def isSukun(archar):
    """Tell whether the character is the Arabic Sukun mark.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar equals SUKUN, False otherwise.
    @rtype: Boolean
    """
    return archar == SUKUN
312
def isShadda(archar):
    """Tell whether the character is the Arabic Shadda mark.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar equals SHADDA, False otherwise.
    @rtype: Boolean
    """
    return archar == SHADDA
321
def isTatweel(archar):
    """Tell whether the character is the Arabic Tatweel letter modifier.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar equals TATWEEL, False otherwise.
    @rtype: Boolean
    """
    return archar == TATWEEL
def isTanwin(archar):
    """Tell whether the character is a Tanwin mark (FATHATAN, DAMMATAN, KASRATAN).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of TANWIN, False otherwise.
    @rtype: Boolean
    """
    return archar in TANWIN
338
def isTashkeel(archar):
    """Tell whether the character is a Tashkeel mark
    (FATHA, DAMMA, KASRA, SUKUN, SHADDA, FATHATAN, DAMMATAN, KASRATAN).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of TASHKEEL, False otherwise.
    @rtype: Boolean
    """
    return archar in TASHKEEL
347
def isHaraka(archar):
    """Tell whether the character is a Haraka mark
    (FATHA, DAMMA, KASRA, SUKUN or one of the Tanwin marks).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of HARAKAT, False otherwise.
    @rtype: Boolean
    """
    return archar in HARAKAT
356
def isShortharaka(archar):
    """Tell whether the character is a short Haraka mark (FATHA, DAMMA, KASRA, SUKUN).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of SHORTHARAKAT, False otherwise.
    @rtype: Boolean
    """
    return archar in SHORTHARAKAT
365
def isLigature(archar):
    """Tell whether the character is a Lam-Alef ligature
    (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of LIGUATURES, False otherwise.
    @rtype: Boolean
    """
    return archar in LIGUATURES
375
def isHamza(archar):
    """Tell whether the character is one of the Hamza forms
    (HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW,
    ALEF_HAMZA_BELOW, ALEF_HAMZA_ABOVE).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of HAMZAT, False otherwise.
    @rtype: Boolean
    """
    return archar in HAMZAT
385
def isAlef(archar):
    """Tell whether the character is one of the Alef forms
    (ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, ALEF_WASLA,
    ALEF_MAKSURA, SMALL_ALEF).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of ALEFAT, False otherwise.
    @rtype: Boolean
    """
    return archar in ALEFAT
395
def isYehlike(archar):
    """Tell whether the character is a Yeh-like form
    (YEH, YEH_HAMZA, ALEF_MAKSURA, SMALL_YEH).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of YEHLIKE, False otherwise.
    @rtype: Boolean
    """
    return archar in YEHLIKE
405
def isWawlike(archar):
    """Tell whether the character is a Waw-like form (WAW, WAW_HAMZA, SMALL_WAW).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of WAWLIKE, False otherwise.
    @rtype: Boolean
    """
    return archar in WAWLIKE
415
def isTeh(archar):
    """Tell whether the character is a Teh form (TEH, TEH_MARBUTA).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of TEHLIKE, False otherwise.
    @rtype: Boolean
    """
    return archar in TEHLIKE
def isSmall(archar):
    """Tell whether the character is a small letter (SMALL_ALEF, SMALL_WAW, SMALL_YEH).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of SMALL, False otherwise.
    @rtype: Boolean
    """
    return archar in SMALL
434
def isWeak(archar):
    """Tell whether the character is a weak letter (ALEF, WAW, YEH, ALEF_MAKSURA).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of WEAK, False otherwise.
    @rtype: Boolean
    """
    return archar in WEAK
444
def isMoon(archar):
    """Tell whether the character is a Moon letter.
    The Moon letters are the members of the MOON tuple.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of MOON, False otherwise.
    @rtype: Boolean
    """
    return archar in MOON
455
def isSun(archar):
    """Tell whether the character is a Sun letter.
    The Sun letters are the members of the SUN tuple.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of SUN, False otherwise.
    @rtype: Boolean
    """
    return archar in SUN
465 ##################################### 466 #{ general letter functions 467 #####################################
def order(archar):
    """Return the alphabetic order of an Arabic letter, between 1 and 29.
    Alef order is 1, Yeh is 28, Hamza is 29.
    Teh Marbuta has the same order as Teh, 3.
    Characters that have no entry in AlphabeticOrder yield 0.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: arabic order, or 0 for an unknown character.
    @rtype: integer;
    """
    # dict.get replaces has_key(), which no longer exists on Python 3.
    return AlphabeticOrder.get(archar, 0)
480
def name(archar):
    """Return the name of an Arabic character, written in Arabic.
    Characters that have no entry in NAMES yield an empty unicode string.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: arabic name, or u'' for an unknown character.
    @rtype: unicode;
    """
    # dict.get replaces has_key(), which no longer exists on Python 3.
    return NAMES.get(archar, u'')
494
def arabicrange(self=None):
    """Return the list of Arabic characters from U+0600 to U+0652 inclusive.
    @param self: unused; kept (now optional) so existing callers that pass
        a placeholder argument keep working.
    @return: list of one-character unicode strings, in code point order.
    @rtype: list of unicode;
    """
    # unichr only exists on Python 2; on Python 3 chr() already returns text.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    # The upper bound is exclusive, so the last character is U+0652 (SUKUN).
    return [to_char(code) for code in range(0x0600, 0x0653)]
508 509 510 ##################################### 511 #{ Has letter functions 512 #####################################
def hasShadda(word):
    """Tell whether the arabic word contains a Shadda.
    @param word: arabic unicode word
    @type word: unicode
    @return: True if SHADDA occurs anywhere in word, False otherwise.
    @rtype: Boolean
    """
    found = re.search(SHADDA, word)
    return found is not None
522 523 ##################################### 524 #{ word and text functions 525 #####################################
def isVocalized(word):
    """Tell whether the arabic word is vocalized.
    The word must not contain spaces or punctuation.
    A word for which isalpha() is true is reported as not vocalized
    without looking for marks; otherwise the word is vocalized when it
    contains at least one character matched by HARAKAT_pattern.
    @param word: arabic unicode word
    @type word: unicode
    @return: True if the word carries a haraka, False otherwise.
    @rtype: Boolean
    """
    if word.isalpha():
        return False
    return HARAKAT_pattern.search(word) is not None
def isVocalizedtext(text):
    """Tell whether the arabic text is vocalized.
    The text may contain many words and spaces; it is vocalized when at
    least one character is matched by HARAKAT_pattern.
    @param text: arabic unicode text
    @type text: unicode
    @return: True if the text carries a haraka, False otherwise.
    @rtype: Boolean
    """
    return HARAKAT_pattern.search(text) is not None
def isArabicstring(text):
    """Check that a text only uses standard Arabic block characters.
    Accepted characters are U+0600 to U+0652, the LAM_ALEF,
    LAM_ALEF_HAMZA_ABOVE and LAM_ALEF_MADDA_ABOVE ligatures, whitespace
    and digits; extended Arabic characters are rejected.
    An empty text is accepted.
    NOTE(review): LAM_ALEF_HAMZA_BELOW is not in the accepted set, unlike
    the other three ligatures - confirm whether that is intended.
    @param text: input text
    @type text: unicode
    @return: True if all characters are accepted
    @rtype: Boolean
    """
    # Character class matching anything that is NOT accepted.
    rejected = u"([^\u0600-\u0652%s%s%s\s\d])"%(LAM_ALEF, LAM_ALEF_HAMZA_ABOVE,LAM_ALEF_MADDA_ABOVE)
    if re.search(rejected, text):
        return False
    return True
561
def isArabicrange(text):
    """Check that every character of a text lies in an Arabic Unicode range.
    The accepted ranges are U+0600-U+06FF, U+0750-U+077F, U+FB50-U+FDFF
    and U+FE70-U+FEFF.  An empty text is accepted.
    @param text: input text
    @type text: unicode
    @return: True if all characters are in the accepted ranges
    @rtype: Boolean
    """
    # Look for the first character outside every accepted range.
    outsider = re.search(u"([^\u0600-\u06ff\ufb50-\ufdff\ufe70-\ufeff\u0750-\u077f])", text)
    return outsider is None
572
def isArabicword(word):
    """Check for a valid Arabic word.
    An Arabic word contains no spaces, digits or punctuation.
    To catch some spelling errors the word is also rejected when:
      - it is empty;
      - it starts with a haraka, WAW_HAMZA or YEH_HAMZA;
      - ALEF_MAKSURA is followed by any character;
      - TEH_MARBUTA is followed by at least two characters, the first of
        which is not DAMMA, KASRA or FATHA.
    @param word: input word
    @type word: unicode
    @return: True if the word passes every check
    @rtype: Boolean
    """
    if len(word) == 0:
        return False
    # Anything outside U+0600-U+0652 and the three accepted ligatures.
    foreign = u"([^\u0600-\u0652%s%s%s])"%(LAM_ALEF, LAM_ALEF_HAMZA_ABOVE,LAM_ALEF_MADDA_ABOVE)
    if re.search(foreign, word):
        return False
    head = word[0]
    if isHaraka(head) or head in (WAW_HAMZA, YEH_HAMZA):
        return False
    # Alef Maksura must be the last character.
    if re.match(u"^(.)*[%s](.)+$"%ALEF_MAKSURA, word):
        return False
    # Teh Marbuta may only be followed by a single character, or by a
    # short vowel and whatever comes after it.
    if re.match(u"^(.)*[%s]([^%s%s%s])(.)+$"%(TEH_MARBUTA,DAMMA,KASRA,FATHA), word):
        return False
    return True
594 ##################################### 595 #{Char functions 596 #####################################
def firstChar(word):
    """
    Return the first char
    An empty word yields an empty string, like the other char accessors
    (secondChar, lastChar, secondlastChar), instead of raising IndexError.
    @param word: given word;
    @type word: unicode;
    @return: the first char, or an empty string if the word is empty
    @rtype: unicode char;
    """
    # Slicing never raises on an empty word, unlike word[0].
    return word[:1]
def secondChar(word):
    """
    Return the second char
    example: zerrouki; 'e' is the second.
    A word shorter than two characters yields an empty string.
    @param word: given word;
    @type word: unicode;
    @return: the second char
    @rtype: unicode char;
    """
    second = word[1:2]
    return second
def lastChar(word):
    """
    Return the last char
    example: zerrouki; 'i' is the last.
    An empty word yields an empty string.
    @param word: given word;
    @type word: unicode;
    @return: the last char
    @rtype: unicode char;
    """
    last = word[-1:]
    return last
def secondlastChar(word):
    """
    Return the char just before the last one
    example: zerrouki; 'k' is the second last.
    A word shorter than two characters yields an empty string.
    @param word: given word;
    @type word: unicode;
    @return: the second last char
    @rtype: unicode char;
    """
    before_last = word[-2:-1]
    return before_last
635 ##################################### 636 #{Strip functions 637 #####################################
def stripHarakat(text):
    """Remove the Harakat from an arabic text, keeping the Shadda.
    The removed marks are :
        - FATHA, DAMMA, KASRA
        - SUKUN
        - FATHATAN, DAMMATAN, KASRATAN
    Example:
        >>> text=u"الْعَرَبِيّةُ"
        >>> stripHarakat(text)
        العربيّة

    @param text: arabic text.
    @type text: unicode.
    @return: the text without harakat.
    @rtype: unicode.
    """
    return HARAKAT_pattern.sub(u'', text)
655
def stripTashkeel(text):
    """Remove every Tashkeel mark from a text, Shadda included.
    The removed marks are :
        - FATHA, DAMMA, KASRA
        - SUKUN
        - SHADDA
        - FATHATAN, DAMMATAN, KASRATAN
    Example:
        >>> text=u"الْعَرَبِيّةُ"
        >>> stripTashkeel(text)
        العربية

    @param text: arabic text.
    @type text: unicode.
    @return: the text without tashkeel.
    @rtype: unicode.
    """
    return TASHKEEL_pattern.sub('', text)
674
def stripTatweel(text):
    """
    Remove every Tatweel character from a text and return the result.

    Example:
        >>> text=u"العـــــربية"
        >>> stripTatweel(text)
        العربية

    @param text: arabic text.
    @type text: unicode.
    @return: the text without tatweel.
    @rtype: unicode.
    """
    without_tatweel = re.sub(TATWEEL, '', text)
    return without_tatweel
690
def normalizeLigature(text):
    """Replace Lam-Alef ligatures by the two letters LAM and ALEF.
    Some systems present the lam-alef ligature as a single character;
    this function converts it into two letters.
    The converted ligatures are :
        - LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE
    Every one of them becomes plain LAM followed by plain ALEF, so the
    hamza or madda carried by the ligature is not kept.

    @param text: arabic text.
    @type text: unicode.
    @return: the converted text.
    @rtype: unicode.
    """
    lam_then_alef = u'%s%s'%(LAM,ALEF)
    return LIGUATURES_pattern.sub(lam_then_alef, text)
def normalizeHamza(word):
    """Standardize the Hamzat into one form of hamza.
    A leading ALEF_MADDA becomes HAMZA + ALEF when the second character is
    not a haraka and either the third character is SHADDA or the word is
    exactly three characters long; otherwise it becomes HAMZA + HAMZA.
    Any other ALEF_MADDA becomes HAMZA + HAMZA, and every remaining hamza
    form matched by HAMZAT_pattern is replaced by HAMZA.
    Example:
        >>> text=u"سئل أحد الأئمة"
        >>> normalizeHamza(text)
        سءل ءحد الءءمة

    @param word: arabic text.
    @type word: unicode.
    @return: return a converted text.
    @rtype: unicode.
    """
    if word.startswith(ALEF_MADDA):
        if len(word)>=3 and (word[1] not in HARAKAT) and (word[2]==SHADDA or len(word)==3):
            word = HAMZA + ALEF + word[1:]
        else:
            word = HAMZA + HAMZA + word[1:]
    # convert all remaining Hamza forms into one form
    word = word.replace(ALEF_MADDA, HAMZA + HAMZA)
    return HAMZAT_pattern.sub(HAMZA, word)
733 734 735
def separate(word):
    """
    Separate the letters from the vowels (harakat) of an arabic word.
    Each letter gets exactly one mark; a letter that has no haraka gets
    NOT_DEF_HARAKA.  A SHADDA is emitted as a letter of its own: the mark
    of the letter before it is replaced by SUKUN and the SHADDA itself
    first receives NOT_DEF_HARAKA.
    NOTE(review): Stack and NOT_DEF_HARAKA are not defined in the visible
    part of this module; presumably they come from `from stack import *`
    - verify.  The loops assume Stack.pop() returns None once empty.
    @param word: arabic word.
    @type word: unicode.
    @return: ( letters,vowels), two strings built from the collected items.
    @rtype: tuple
    """
    #debug=True;
    stack1=Stack(word)
    # reverse the items so that pop() yields the characters first to last
    stack1.items.reverse();
    letters=Stack()
    marks=Stack()
    vowels=HARAKAT
    last1=stack1.pop();
    # the first element must be a letter:
    # an arabic word can't start with a haraka,
    # so any leading harakat are skipped
    while last1 in vowels: last1=stack1.pop();
    while last1!=None:
        if last1 in vowels:
            # we can't have two harakat side by side:
            # the mark pushed for the current letter is replaced
            marks.pop();
            marks.push(last1);
        elif last1==SHADDA:
            # if the element is a Shadda,
            # the previous letter must have a sukun as mark,
            # and the shadda takes the undefined mark
            marks.pop();
            marks.push(SUKUN);
            marks.push(NOT_DEF_HARAKA);
            letters.push(SHADDA);
        else:
            # plain letter: undefined mark until a haraka follows
            marks.push(NOT_DEF_HARAKA);
            letters.push(last1);
        last1=stack1.pop();
    return (''.join(letters.items),''.join(marks.items))
773 774
def joint(letters,marks):
    """
    Join the letters with the marks into one word.
    The length of letters and marks must be equal.
    A mark equal to NOT_DEF_HARAKA is not written to the output.  When a
    SHADDA letter is met, a haraka written just before it is dropped so
    that the shadda directly follows the previous letter.
    NOTE(review): Stack and NOT_DEF_HARAKA are not defined in the visible
    part of this module; presumably they come from `from stack import *`
    - verify.  The loop assumes Stack.pop() returns None once empty.
    @param letters: the letters of the word.
    @param marks: one mark per letter.
    @return: the joined word; "" if the two lengths differ; False if
        either input stack is not empty after the loop.
    """
    #debug=True;
    debug=False;
    # The length of letters and marks must be equal
    if len(letters)!=len(marks): return "";

    # reverse both item lists so that pop() yields elements first to last
    stackLetter=Stack(letters)
    stackLetter.items.reverse();
    stackMark=Stack(marks)
    stackMark.items.reverse();
    wordStack=Stack();
    last1=stackLetter.pop();
    last2=stackMark.pop();

    vowels=HARAKAT
    while last1!=None and last2!=None:
        if last1 == SHADDA:
            # drop the haraka written for the previous letter, if any;
            # anything else that was popped is put back
            top=wordStack.pop();
            if top not in vowels:
                wordStack.push(top);
            wordStack.push(last1);
            if last2!= NOT_DEF_HARAKA:
                wordStack.push(last2);
        else:
            wordStack.push(last1);
            if last2!= NOT_DEF_HARAKA:
                wordStack.push(last2);

        last1=stackLetter.pop();
        last2=stackMark.pop();
    if not (stackLetter.isEmpty() and stackMark.isEmpty()):
        return False;
    else:
        #wordStack.items.reverse();
        return ''.join(wordStack.items);
815
def vocalizedlike(word1,word2):
    """
    Tell whether two words have the same letters and compatible harakat.
    The two words can be fully vocalized or partially vocalized: a haraka
    that faces a non-haraka in the other word is skipped.
    The words are compared from their last character backwards.
    NOTE(review): the result only depends on both stacks being empty after
    the loop, not on the last pair of characters popped; a mismatch on the
    final pair therefore still yields True - confirm whether intended.
    @param word1: first word.
    @param word2: second word.
    @return: True if both stacks were fully consumed, False otherwise.
    @rtype: Boolean
    """
    left = Stack(word1)
    right = Stack(word2)
    char1 = left.pop()
    char2 = right.pop()
    while char1 is not None and char2 is not None:
        if char1 == char2:
            # same character on both sides: move on in both words
            char1 = left.pop()
            char2 = right.pop()
            continue
        first_is_mark = char1 in HARAKAT
        second_is_mark = char2 in HARAKAT
        if first_is_mark and not second_is_mark:
            # extra haraka in the first word only: skip it
            char1 = left.pop()
        elif second_is_mark and not first_is_mark:
            # extra haraka in the second word only: skip it
            char2 = right.pop()
        else:
            # two different letters, or two different harakat
            break
    if left.isEmpty() and right.isEmpty():
        return True
    return False
#-------------------------
# Function def waznlike(word1,wazn):
#-------------------------
def waznlike(word1,wazn):
    """
    Tell whether word1 follows a wazn (pattern).
    The letters must be equal, except that the FEH, AIN and LAM letters of
    the wazn are generic: each of them accepts any non-haraka character of
    the word.  The two words can be fully vocalized or partially
    vocalized: a haraka that faces a non-haraka on the other side is
    skipped.  The comparison runs from the last character backwards.
    This function no longer prints anything; the debugging output that
    was written to stdout on every call has been removed.
    NOTE(review): Stack is not defined in the visible part of this module;
    the loop assumes Stack.pop() returns None once empty - verify.
    @param word1: the word to check.
    @param wazn: the pattern, written with FEH, AIN, LAM as placeholders.
    @return: True if both stacks were fully consumed, False otherwise.
    @rtype: Boolean
    """
    placeholders = (FEH, AIN, LAM)
    vowels = HARAKAT
    stack1 = Stack(word1)
    stack2 = Stack(wazn)
    last1 = stack1.pop()
    last2 = stack2.pop()
    while last1 is not None and last2 is not None:
        if last1 == last2 and last2 not in placeholders:
            # identical literal character (letter or haraka)
            last1 = stack1.pop()
            last2 = stack2.pop()
        elif last1 not in vowels and last2 in placeholders:
            # generic wazn letter facing a letter of the word
            last1 = stack1.pop()
            last2 = stack2.pop()
        elif last1 in vowels and last2 not in vowels:
            # extra haraka in the word only: skip it
            last1 = stack1.pop()
        elif last1 not in vowels and last2 in vowels:
            # extra haraka in the wazn only: skip it
            last2 = stack2.pop()
        else:
            break
    if not (stack1.isEmpty() and stack2.isEmpty()):
        return False
    return True
890
def shaddalike(partial,fully):
    """
    Tell whether a partially vocalized word agrees with a fully vocalized
    one on letters and shadda placement.
    The harakat of both words are stripped first (stripHarakat keeps the
    shadda).  A shadda present in the partial word must be at the same
    place in the full word; a shadda present only in the full word is
    skipped.  The comparison runs from the last character backwards.
    @param partial: the partially vocalized word.
    @param fully: the fully vocalized word.
    @return: True if both stacks were fully consumed, False otherwise.
    @rtype: Boolean
    """
    partial_stack = Stack(stripHarakat(partial))
    full_stack = Stack(stripHarakat(fully))
    p_char = partial_stack.pop()
    f_char = full_stack.pop()
    while p_char is not None and f_char is not None:
        if p_char == f_char:
            p_char = partial_stack.pop()
            f_char = full_stack.pop()
        elif f_char == SHADDA:
            # shadda only in the full word (p_char differs from it here)
            f_char = full_stack.pop()
        else:
            # shadda only in the partial word, or two different letters
            break
    if partial_stack.isEmpty() and full_stack.isEmpty():
        return True
    return False
def reduceTashkeel(text):
    """
    Reduce the Tashkeel by deleting the marks of evident cases.
    The deletions are applied one after the other, each on the result of
    the previous one.
    @param text: the input text fully vocalized.
    @type text: unicode.
    @return : partially vocalized text.
    @rtype: unicode.
    """
    deletions = (
        # every sukun or fatha that does not directly follow waw or yeh
        u"(?<!(%s|%s))(%s|%s)"%( WAW, YEH, SUKUN, FATHA),
        # damma directly followed by waw
        u"%s(?=%s)"%(DAMMA, WAW),
        # kasra directly followed by yeh
        u"%s(?=%s)"%(KASRA, YEH),
        # fatha directly followed by alef (fatha left on waw/yeh before alef)
        u"%s(?=%s)"%(FATHA, ALEF),
        # fatha on a waw or yeh that follows a whitespace character
        u"(?<=\s(%s|%s))%s"%(WAW, YEH, FATHA),
        # fatha on a waw or yeh at the very beginning of the text
        u"(?<=\A(%s|%s))%s"%(WAW, YEH, FATHA),
        # kasra directly after hamza below alef
        u"(?<=%s)%s"%(ALEF_HAMZA_BELOW,KASRA),
    )
    reduced = text
    for deletion in deletions:
        reduced = re.sub(deletion, '', reduced)
    return reduced
953