## This is the master file to generate the TECkit ArabTeX font mappings
## for use with XeTeX
## The mappings are generated with a simple perl script (makemaps.pl)
##
## Conventions used:
## ---------------------------------------------------------------------------------
## Lines starting with ## are comments that don't appear in the map files
##
## Lines that don't begin with # are common to all mappings
## Otherwise the letters following '#' indicate specific mappings
## according to vocalization modes and languages
## #...n -> non-vocalized
## #...v -> vocalized and fully voc.
## #...f -> fully voc.
## @v -> voc only
## The language codes are:
## A -> Arabic
## B -> Maghribi Arabic
## P -> Persian
## U -> Urdu
## T -> Pashto
## S -> Sindhi
## K -> Kashmiri
## O -> Ottoman Turkish
## M -> Malay
## KURDISH AND UIGHUR ARE CODED SEPARATELY <<<
## The above can be negated with #-[APUTSKOM]
## Lines starting with '@[AP...][nvf]' are specific to the mapping
## indicated by the two-letter key, following the conventions above
## ----------------------------------------------------------------------------------
@An LHSName "ArabTeX-novowels"
@Av LHSName "ArabTeX-vocalized"
@Af LHSName "ArabTeX-fullvocalized"
@Bn LHSName "ArabTeX-maghribi-novowels"
@Bv LHSName "ArabTeX-maghribi-vocalized"
@Bf LHSName "ArabTeX-maghribi-fullvocalized"
@Pn LHSName "ArabTeX-farsi-novowels"
@Pv LHSName "ArabTeX-farsi-vocalized"
@Pf LHSName "ArabTeX-farsi-fullvocalized"
@Un LHSName "ArabTeX-urdu-novowels"
@Uv LHSName "ArabTeX-urdu-vocalized"
@Uf LHSName "ArabTeX-urdu-fullvocalized"
@Sn LHSName "ArabTeX-sindhi-novowels"
@Sv LHSName "ArabTeX-sindhi-vocalized"
@Sf LHSName "ArabTeX-sindhi-fullvocalized"
@Kn LHSName "ArabTeX-kashmiri-novowels"
@Kv LHSName "ArabTeX-kashmiri-vocalized"
@Kf LHSName "ArabTeX-kashmiri-fullvocalized"
@Tn LHSName "ArabTeX-pashto-novowels"
@Tv LHSName "ArabTeX-pashto-vocalized"
@Tf LHSName "ArabTeX-pashto-fullvocalized"
## @Kn LHSName "ArabTeX-kurdish-novowels"
## @Kv LHSName "ArabTeX-kurdish-vocalized"
## @Kf LHSName "ArabTeX-kurdish-fullvocalized"
@On LHSName "ArabTeX-turk-novowels"
@Ov LHSName "ArabTeX-turk-vocalized"
@Of LHSName "ArabTeX-turk-fullvocalized"
@Mn LHSName "ArabTeX-malay-novowels"
@Mv LHSName "ArabTeX-malay-vocalized"
@Mf LHSName "ArabTeX-malay-fullvocalized"
## @Rn LHSName "ArabTeX-uighur-novowels"
## @Rv LHSName "ArabTeX-uighur-vocalized"
## @Rf LHSName "ArabTeX-uighur-fullvocalized"
##
##
RHSName "UNICODE"

## ----------------------------------------------------------------------------------
## Symbolic names for the ASCII transliteration characters used in the rules below
## ----------------------------------------------------------------------------------
Define a U+0061 ;a
Define b U+0062 ;b
Define c U+0063 ;c
Define d U+0064 ;d
Define e U+0065 ;e
Define f U+0066 ;f
Define g U+0067 ;g
Define h U+0068 ;h
Define i U+0069 ;i
Define j U+006A ;j
Define k U+006B ;k
Define l U+006C ;l
Define m U+006D ;m
Define n U+006E ;n
Define o U+006F ;o
Define p U+0070 ;p
Define q U+0071 ;q
Define r U+0072 ;r
Define s U+0073 ;s
Define t U+0074 ;t
Define u U+0075 ;u
Define v U+0076 ;v
Define w U+0077 ;w
Define x U+0078 ;x
Define y U+0079 ;y
Define z U+007A ;z
Define A U+0041 ;A
Define B U+0042 ;B
Define E U+0045 ;E
Define G U+0047 ;G
Define I U+0049 ;I
Define N U+004E ;N
Define O U+004F ;O
Define T U+0054 ;T
Define U U+0055 ;U
Define W U+0057 ;W
Define Y U+0059 ;Y
Define C U+005E ;^ << CARET
Define UL U+005F ;_ << UNDERLINE
Define P U+002E ;. << DOT
Define V U+002C ;, << COMMA (_V_irgule)
Define CO U+003A ;: << COLON
Define SC U+003B ;; << SEMICOLON
Define Ain U+0060 ;`
Define H U+0027 ;' << HAMZA
Define Q U+0022 ;" << QUOTE
Define Z U+002D ;- << HYPHEN
Define BAR U+007C ;|
Define AN U+0061 U+004E ; aN
Define LLAH U+006C U+006C U+005F U+0061 U+0068 ; ll_ah
;Define NUL U+00A6;
## Hamza carriers produced by the contextual hamza rules of the main pass
Define Ha U+0623
Define Hi U+0625
Define Hy U+0626
Define Hw U+0624
Define Hh U+06C0 ;; = U+0647 U+0654
Define HA U+0622
Define HB U+0640 U+0654
Define HBAR U+0621 ; '|

## ----------------------------------------------------------------------------------
## Preliminary pass: drop the private-use capitalization marker
## ----------------------------------------------------------------------------------
pass(Unicode)
U+E000 > ; eliminate special code for capitalization in transliterations

## ----------------------------------------------------------------------------------
## First main pass: ligature mnemonics, vowel normalisation, ezafe,
## initial alif and contextual hamzas (still expressed in transliteration letters)
## ----------------------------------------------------------------------------------
pass(Unicode)
Class [VWL] = (U+0061 U+0065 U+0069 U+006F U+0075 ) ;(a e i o u)
Class [AVWL] = (U+064B..U+0650);
Class [LTR] = (U+0061..U+007A U+0041 U+0045 U+0047 U+0049 U+004F U+0055 U+0042 U+004E U+0054 U+0059 U+005E U+002E U+005F U+0022 U+0027 U+0060) ;a..z A E G I O U B N T Y ^ . _ " ' `
Class [BRACKETS] = (U+003C U+003E U+00AB U+00BB U+0028 U+0029 U+005B U+005D U+2018 U+2019 U+201C U+201D); < > « » ( ) [ ] ‘ ’ “ ”
Class [PUNCT] = (V SC P CO)
Class [Iy] = (U+0049 U+0079) ;(I y)
Class [Uw] = (U+0055 U+0077) ;(U w)
Class [Uwu] = (U+0055 U+0077 U+0075) ;(U w u)
;Class [ae] = (U+0061 U+0065) ;(a e)
;Class [ou] = (U+0075 U+006F) ;(u o)
Class [Iyi] = (U+0069 U+0079 U+0049)
Class [UI] = (U+0049 U+0055) ; U I
Class [Digits] = (U+0030..U+0039)
Define BEG (#|Z|[BRACKETS])
Define END (#|[BRACKETS]|[PUNCT] U+0020|[PUNCT] #)
Define ENDZ (#|[BRACKETS]|[PUNCT] U+0020|[PUNCT] #|Z)

; special ligatures
;; 1. Standard ArabTeX input
;; FDF2 is just used as temporary holder, and is converted back to ligature lam lam hah later (to be taken care of by the font)
#Af Z (a|A) l Z LLAH > U+0671 U+FDF2 ;; '-al-ll_ah' or '-Al-ll_ah'
;; The BAR is used in the DMG transliteration to suppress the initial A
(a|A) BAR? l Z LLAH > U+0627 U+FDF2 ;; 'al-ll_ah' or 'Al-ll_ah'
LLAH > U+FDF2 ;; 'll_ah' -> 'l l h'
;; UPPERCASE mnemonics for various ligatures or special glyphs
U+004C U+004C U+0048 > U+FDF2 ;; 'LLH' -> 'l l h'
U+0046 U+0041 U+004C U+0049 U+004C U+004C U+0041 U+0048 > U+0641 U+064E U+0644 U+0650 U+0644 U+0647 ;; 'FALILLAH'
U+0041 U+004C U+004C U+0041 U+0048 > U+0627 U+0644 U+0644 U+0647 ; 'ALLAH'
U+004C U+004C U+0041 U+0048 > U+0644 U+0644 U+0647 ; 'LLAH'
U+0053 U+004C U+004D > U+FDFA ; 'SLM' : .sallY al-ll_ah `alayhi wa-sallam
U+0028 U+0028 > U+FD3F ;; '((' : ornate right par
U+0029 U+0029 > U+FD3E ;; '))' : ornate left par
U+0052 U+0049 U+0059 U+0041 U+004C > U+FDFC ;; 'RIYAL' : Saudi currency sign
;; most fonts won't have the following ones:
U+0053 U+0041 U+004C U+004C U+0041 U+0053 U+0054 U+004F U+0050 > U+FDF0 ;; 'SALLASTOP' (.sallY with "ye-barree", Quranic stop sign)
U+0051 U+0041 U+004C U+0041 > U+FDF1 ;; 'QALA' : qalY with ye-barree, Quranic stop sign
U+0041 U+004B U+0042 U+0041 U+0052 > U+FDF3 ;; 'AKBAR'
U+004D U+0055 U+0048 U+0041 U+004D U+004D U+0041 U+0044 > U+FDF4 ;; 'MUHAMMAD'
U+0053 U+0041 U+004C U+0041 U+004D > U+FDF5 ;; 'SALAM'
U+0052 U+0041 U+0053 U+004F ? U+0055 U+004C > U+FDF6 ;; 'RASUL' (also unicode 'RASOUL')
U+0041 U+004C U+0041 U+0059 U+0048 (U+0049|U+0045) > U+FDF7 ;; 'ALAYHI' (also unicode 'ALAYHE')
U+0057 U+0041 U+0053 U+0041 U+004C U+004C U+0041 U+004D > U+FDF8 ;; 'WASALLAM'
U+0053 U+0041 U+004C U+004C U+0041 > U+FDF9 ;; 'SALLA'
U+004A U+0041 U+004C U+004C U+0041 > U+FDFB ;; 'JALLA' : "jalla jalAluhu"
U+0042 U+0041 U+0053 U+004D U+0041 U+004C U+0041 > U+FDFD ;; 'BASMALA' : very few fonts have this ligature afaik
U+005E U+0053 U+004C U+0059 > U+06D6 ;; ^SLY
U+005E U+0051 U+004C U+0059 > U+06D7 ;; ^QLY
## The trailing hyphen (U+002D) distinguishes '^MIM-' from '^MIM' (U+06E2) below;
## without it both rules had the same left-hand side and one of them could never fire.
U+005E U+004D U+0049 U+004D U+002D > U+06D8 ;; ^MIM-
U+005E U+004C U+0041 > U+06D9 ;; ^LA
U+005E U+004A U+0049 U+004D > U+06DA ;; ^JIM
U+005E U+0044 U+004F U+0054 U+0053 > U+06DB ;; ^DOTS
U+005E U+0053 U+0049 U+004E > U+06DC ;; ^SIN
U+0048 U+0049 U+005A U+0042 > U+06DE ;; HIZB
U+0043 U+0049 U+0052 U+0043 U+005A U+0045 U+0052 U+004F > U+06DF ;; CIRCZERO
U+0052 U+0045 U+0043 U+0054 U+005A U+0045 U+0052 U+004F > U+06E0 ;; RECTZERO
U+005E U+004A U+0041 U+005A U+004D > U+06E1 ;; ^JAZM
U+005E U+004D U+0049 U+004D > U+06E2 ;; ^MIM
U+005F U+0053 U+0049 U+004E > U+06E3 ;; _SIN
U+005E U+004D U+0041 U+0044 U+0044 U+0041 > U+06E4 ;; ^MADDA
U+0057 U+0041 U+0057 > U+06E5 ;; WAW
U+0059 U+0045 U+0048 > U+06E6 ;; YEH
U+005E U+0059 U+0045 U+0048 > U+06E7 ;; ^YEH
U+005E U+004E U+0055 U+004E > U+06E8 ;; ^NUN
U+0053 U+0041 U+004A U+0044 U+0041 > U+06E9 ;; SAJDA
U+005F U+0053 U+0054 U+004F U+0050 > U+06EA ;; _STOP
U+005E U+0053 U+0054 U+004F U+0050 > U+06EB ;; ^STOP
U+005E U+0052 U+0053 U+0054 U+004F U+0050 > U+06EC ;; ^RSTOP
U+005F U+004D U+0049 U+004D > U+06ED ;; _MIM
U+0044 U+004F U+0054 U+0053 > U+061E ;; DOTS (also possible by typing "::")
;; This is to enclose digits within glyph U+06DD
U+005B U+005B ([Digits]+)=dig U+005D U+005D > U+06DD @dig ; [[digits]]

; vowels
u u > U
i i > I
a a > A
#n [AVWL] > ; strip off vowels when writing UTF-8 Arabic with novoc mode
#T a e > a U+0626
#T E e > U+0659 U+06CD
#T e e > U+0659 U+0626
#T E > U+0659 U+06D0
CO O > U+06FC ;
#T O > U+0657 w
#T o > U+0657
#U H E > U+06D3 ;; hamza + yeh barree
#U E / _ # > U+06D2
#U E > y
#U a e / _ # > a U+06D2
#-T a e > a y
a o > a w
#PUKS (o|u)? O / # _ > U+0627 w
#PUKS (o|u)? O > w
CO U > U+06C7 ;
#-TK e > i ;; but not pashto > U+0659 and kashmiri > y + U+0658 (U+06CE is kurdish)
#T e > U+0659
#K e / # _ # > U+0627 U+06D2 U+0658 ; Alif + yeh barree + bow accent
#K e / _ # > U+06D2 U+0658 ; yeh barree + bow accent
#K e / # _ > U+0627 y U+0658 ; alif + ya + bow accent
#K e > y U+0658
#K E / # _ # > U+0627 U+06D2 ; Alif + yeh barree
#K E / _ # > U+06D2 ; yeh barree
#K E / # _ > U+0627 y ; alif + ya
#K E > y
#K o / # _ > U+0627 U+06C6
#K o > U+06C6
#-TK o > u
#AP E > I
#A O > U
#K P A > U+0672
#K P U > U+0673 ; > this is the initial or lone form, but medial and final require wavy hamza below previous letter (not in Unicode)! Perhaps an OT font could solve this :: in Scheherazade the glyph needed is named _wavyHamza <<<
#K P a / # _ > U+0623
#K P a > U+0654
#K P u > U+0655
#K P o / # _ > U+0627 U+06C4
#K P o > U+06C4
#K P O / # _ > U+0627 U+06C4 U+0627 U+064E
#K P O > U+06C4 U+0627 U+064E
#K I > y U+0656
#K I / _ # > y
#K U / _ # > U+0627 w U+0657
#K U > w U+0657
#-ASMK I / _ # > i y
#-A ;; Persian ezafe (ALL BUT ARABIC AND UIGHUR)
#-A U+0048 Z UL? (Q? (i|e))=vv / _ END > U+06C0 @vv U+200C ;H-i -> heh+hamza final; what about 06C2 in Urdu ?<<<
#-A U+0048 Z Q?=qq (I|E)=vv / _ END > h U+0020 U+0627 @qq i @vv ; H-I
#-A Q?=q1 (I|E) Z UL? Q?=q2 (i|e) / _ END > @q1 i U+0626 @q2 i U+200C ; ...I-e
#-A Z Q?=qq I / (A|U) _ ENDZ > U+0626 @qq i y ; ...A-I or ...U-I
#-A Z UL? (Q? (i|e))=vv / (A|U) _ END > U+0649 @vv ; ...A-i or ...U-i
#-A Z Q?=q1 (e|i) / _ ENDZ > @q1 i ; ...-i
#-A Z / U+0048 _ > U+0020 ;
#-A b Q?=q1 (e|i) Q?=q2 U / # _ > b @q1 i U+0627 @q2 u w ;; special case of prep be without hyphen

;; initial characters ...
## Initial alif handling (continuation of the first main pass)
a l Z / BEG _ > U+0627 l Z
#An H l Z / BEG _ > U+0627 l Z
@Av H l Z / BEG _ > U+0627 l Z
@Bv H l Z / BEG _ > U+0627 l Z
#Af H l Z / BEG _ > U+0671 l Z ; THIS IS NOT AN ARABTEX CONVENTION! it's a work-around coz pattern-matching across word boundaries does not work in teckit , so min al- must be written mina 'l- in fullvocalize mode
;; wasla on initial alif : NEW! NOT TESTED!
#Af [VWL]=v1 Z / BEG _ > @v1 U+0671 ; THIS IS NOT AN ARABTEX CONVENTION! it's a work-around coz pattern-matching across word boundaries does not work in teckit , so huwa irtifA` must be written huwa i-rtifA` in fullvocalize mode
#n H / BEG _ l l > U+0627 ;
#v H / BEG _ l l > U+0671 ; e.g. alla_dI -> 'lla_dI
#n [VWL]=v1 / BEG _ > A @v1
#v [VWL]=v1 / BEG _ > U+0627 @v1
#An A / BEG _ > A a
#Av A / BEG _ > U+0627 a
#PU A / BEG _ > HA ;; ADD MORE ? <<<
#n Q?=q1 U / BEG _ > A @q1 u w ; -U
#v Q?=q1 U / BEG _ > U+0627 @q1 u w ; -U
#n ; Q?=q1 I / BEG _ > A @q1 i y ; -I ; but not when I is final FIXME!!!
#v ; Q?=q1 I / BEG _ > U+0627 @q1 i y ; -I
#-Pn Q?=q1 I / BEG _ > A @q1 i y ; except for Persian, coz of ezafe rules
#-Pv Q?=q1 I / BEG _ > U+0627 @q1 i y
#Pn Q?=q1 I / BEG _ ^# > A @q1 i y ; -I...
#Pv Q?=q1 I / BEG _ ^# > U+0627 @q1 i y ; -I...
#P Z Q?=qq (I|E) / _ (#|Z) > @qq i y ;
#P UL Q?=q1 U > @q1 U
#P Q?=q1 U / # _ # > U+0627 @q1 u w
#P Q BAR > U+0020; "| to separate compounds >> IN TRANSLIT THIS WILL BE REMOVED

;; Words with anomalous orthography:
; mi'aT -> ماية
;;; i H / m Q ? _ Q ? a ? (T|t) > i A Hy
#n m Q i H Q a / _ (T|t) > m Q i A Hy Q a
#n m Q i H a / _ (T|t) > m Q i A Hy
#n m i H Q a / _ (T|t) > m A Hy Q a
#n m i H a / _ (T|t) > m A Hy
#v m Q i H Q a / _ (T|t) > m U+0627 Hy
#v m Q i H a / _ (T|t) > m U+0627 Hy a
#v m i H Q a / _ (T|t) > m i U+0627 Hy
#v m i H a / _ (T|t) > m i U+0627 Hy a
#-S
#-S ;; contextual analysis of hamzas
#-S ;; THESE DO NOT APPLY FOR SINDHI
#-S
#-S ; initial hamzas
#-S H / (#|^[LTR]) _ Q? [Iyi] > Hi
#-S H Q A / (#|^[LTR]) _ > HA ; alif madda
#-S H / (#|^[LTR]) _ Q? [Uwu] > Ha
#-S H / (#|^[LTR]) _ Q? a > Ha
#-S Q H / (#|^[LTR]) _ > HB ; quoted isolated hamza at beginning of word
#-S
#-S ; quoted hamzas in contextual mode, equivalent to verbatim mode
#-S a Q H > Ha
#-S i Q H > Hi
#-S y Q H > Hy
#-S w Q H > Hw
#-S h Q H > Hh
#-S A Q H > HA
#-S B Q H > HB
#-S BAR Q H > HBAR
#-S
#-S ; hamza + fathatan
#-S H / a _ Q? AN > Ha
#-S ;;H / [Iyi] _ Q? AN > Hy
#-S H / _ Q? AN > Hy
#-S
#-S ;final hamzas
#-S H / A _ Q? [VWL]? N? (#|^[LTR]) > HBAR
#-S H / [Uw] _ Q? [VWL]? N? (#|^[LTR]) > HBAR
#-S H / [Iy] _ Q? (i N?|u N?)? (#|^[LTR]) > HBAR ; not AN ...
#-S H / a _ Q? (#|^[LTR]) > Ha ; not AN...
#-S H / a _ Q? (u|i) N? (#|^[LTR]) > Ha ; not AN...
#-S H / i _ Q? [VWL]? N? (#|^[LTR]) > Hy
#-S H / u _ Q? [VWL]? N? (#|^[LTR]) > Hw
#-S
#-S ;;and more hamzas ...
#-S
#-S ; n°1: kasra/ya
#-S H / [LTR] _ Q? [Iyi] ^N > Hy
#-S H / [Iyi] _ > Hy
#-S
#-S ; n°2: damma/waw
#-S H / [LTR] _ Q? (U|u) ^N > Hw
#-S H / (U|u) _ Q? ^[Iyi] > Hw
#-S H / U _ > HBAR
#-S
#-S ; n°3: fatha/alif
#-S H Q? A > HA
#-S H / A _ Q? a > HBAR
#-S H / ^[UI] _ Q? a > Ha
#-S H / a _ Q? ^[UI] > Ha
##
##;; take care of all shaddas at once? NOT WORKING, have to give them one by one
##;; ((V|C|P|UL)=l1 (b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|w|x|y|z|Ain|T)=l2){2,2} > @l1 @l2 U+0651 ;
##;; ((b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|w|x|y|z|Ain|T)=l1){2,2} > @l1 U+0651 ;
##;; #U ;special rules for Urdu : NO! tashdid realized as ^c^ch instead of ^ch^ch
##;; #U (C (c|g)=x h){2,2} > C @x h U+0651
##;; #U (V (t|d|r)=x h){2,2} > V @x h U+0651;
##;; #U ((b|p|t|j|d|r|k|g|l|m|n)=x h){2,2} > @x h U+0651

## ----------------------------------------------------------------------------------
## Second main pass: transliteration letters -> Arabic-script code points.
## Each doubled letter has its own rule emitting the letter followed by shadda (U+0651).
## ----------------------------------------------------------------------------------
pass(Unicode)
Class [Digits] = (U+0030..U+0039)
Class [ArDigits] = (U+0660..U+0669)
Class [ArEasternDigits] = (U+06F0..U+06F9)
Class [VWL] = (U+0061 U+0065 U+0069 U+006F U+0075 ) ;(a e i o u)
## U+0047 (G) added so that this class agrees with the [LTR] class of the first pass;
## G is still an input letter in this pass (see 'G > U+06AB' below).
Class [LTR] = (U+0061..U+007A U+0041 U+0045 U+0047 U+0049 U+004F U+0055 U+0042 U+004E U+0054 U+0059 U+005E U+002E U+005F U+0022 U+0027 U+0060) ;(a..z A E G I O U B N T Y ^ . _ " ' `)
Class [Iy] = (U+0049 U+0079) ;(I y)
Class [Uw] = (U+0055 U+0077) ;(U w)
##; all hamzas are now in verbatim mode
##;;;H a > U+0623
##;;;H i > U+0625
##;;;H y > U+0626
##;;;H w > U+0624
##;;;H h > U+0647 U+0654
##;;;H A > U+0622
##;;;H B > U+0640 U+0654
##;;;H BAR > U+0621 ; '|
;normal code
## add code for V S M K .k .l (.p=.f) (.v=.b) .y
## .A .U (.T=Y)
## CHECK ,a ,c ,d ,e ,g ,h ,n ,r ,s ,t ,z ,A
## ^ alone=alif with hazma above, ^a=alif hamza fatha, ^i = alif hamza kasra, ^u=alif damma waw (no hamza)
## CHECK ^d ^e ^i ^l ^n ^z
## CHECK (^A='A)
## CHECK (_k=.g) (_s=_t) (_z=_d)
## IMPLEMENT (:a :e :i :o :u = alif + vowel)
## IMPLEMENT :b :d :g :j :n :s (:t=.t?) (:z=.d?) :O :U
#S ;; special code for sindhi
#S ;; U+004D U+0049 U+004E > U+06FE ; define macro \MIN in sindhi env
#S P m I N > U+06FE ; .mIN
#S ;; U+0049 U+004E > U+06FD ; IN define macro \IN in sindhi env
#S H BAR I N > U+06FD ; '|IN
#K CO z CO z > U+0636 U+0651 ;;SHADDA
#K CO z > U+0636
(P z|U+1E93) (P z|U+1E93) > U+0638 U+0651 ;;SHADDA
(P z|U+1E93) > U+0638
C z C z > U+0698 U+0651 ;;SHADDA
C z > U+0698
#-O V z V z > U+0696 U+0651 ;;SHADDA
#-O V z > U+0696 ;pashto
#O V z V z > U+0636 U+0651 ;;SHADDA
#O V z > U+0636
z z > U+0632 U+0651 ;;SHADDA
z > U+0632
#n Y Y > U+0649 U+0651 ;;SHADDA
#n Y > U+0649
#v Q Y Y > U+0649 U+0651 ;;SHADDA
#v Q Y > U+0649
#v Y Y > U+064E U+0649 U+0651 ;;SHADDA
#v Y > U+064E U+0649
#K ;; .b with ring below : FIXME not in Unicode! but U+06EA is a poor replacement:
#K P y / # _ # > U+0627 U+06D2 U+06EA
#K P y / _ # > U+0649
#K P y > U+066E U+06EA
#-ASM ;; y / _ # > U+0649 ; y final in persian,ottoman,urdu,pashto,kashmiri has no dots
#-ASM y y > U+06CC U+0651 ;;SHADDA
#-ASM y > U+06CC ; yeh in persian etc. > iso and final shapes have no dots
#ASMv P I P I / _ # > U+0650 U+06CC U+0651 ;; SHADDA
#ASMv P I / _ # > U+0650 U+06CC ;; workaround to have ya without dots in Arabic
#ASMn P I P I / _ # > U+06CC U+0651 ;; SHADDA
#ASMn P I / _ # > U+06CC ;; workaround to have ya without dots in Arabic
#ASM y y > U+064A U+0651 ;;SHADDA
#ASM y > U+064A
#B v v > U+06A5 U+0651 ;;SHADDA
#B v > U+06A5 ;; maghribi
#M v v > U+06CF U+0651 ;;SHADDA
#M v > U+06CF
#-MB v v > U+06A4 U+0651 ;;SHADDA
#-MB v > U+06A4
W > U+0648 U+0627 ; waw + alif
C U > U+064F U+0648 U+0653; historical spelling : madda on waw
#n Q U [Uw] > U+064F U+0648 U+0651 ;;SHADDA
#n Q U > U+064F U+0648
#n [Uw] [Uw] > U+0648 U+0651 ;;SHADDA
#n [Uw] > U+0648
C w C w > U+06C9 U+0651 ;;SHADDA
C w > U+06C9
CO w CO w > U+06CA U+0651 ;;SHADDA
CO w > U+06CA
#n Q u N > U+064C
#n u N >
#n Q u > U+064F
#v U A / _ # > U+064F U+0648 U+0627
#v Q U [Uw] > U+0648 U+0651 ;;SHADDA
#v Q U > U+0648
#v U [Uw] > U+064F U+0648 U+0651 ;;SHADDA
#v U > U+064F U+0648
#v w w > U+0648 U+0651 ;;SHADDA
#v w > U+0648
#v Q u N? >
#v u N > U+064C
#v u > U+064F
#S V t h V t h > U+067A U+0651 ;;SHADDA
#S V t h > U+067A
#S t h t h > U+067F U+0651 ;;SHADDA
#S t h > U+067F
#S V t V t > U+067D U+0651 ;;SHADDA
#S V t > U+067D
#T V t V t > U+067C U+0651 ;;SHADDA
#T V t > U+067C ; pashto
#-TS V t V t > U+0679 U+0651 ;;SHADDA
#-TS V t > U+0679 ;urdu
UL (t|s) UL (t|s) > U+062B U+0651 ;;SHADDA
UL (t|s) > U+062B
(P t|U+1E6D) (P t|U+1E6D) > U+0637 U+0651 ;;SHADDA
(P t|U+1E6D) > U+0637
t t > U+062A U+0651 ;;SHADDA
t > U+062A
#n T T Q AN > U+0629 U+0651 U+064B ;;SHADDA
#n T Q AN > U+0629 U+064B
#n T AN > U+0629
#v T T Q AN > U+0629 U+0651 ;;SHADDA
#v T Q AN > U+0629
#v T AN > U+0629 U+064B
T T > U+0629 U+0651 ;;SHADDA
T > U+0629
C s C s > U+0634 U+0651 ;;SHADDA
C s > U+0634
(P s|U+1E63) (P s|U+1E63) > U+0635 U+0651 ;;SHADDA
(P s|U+1E63) > U+0635
#-O V s V s > U+069A U+0651 ;;SHADDA
#-O V s > U+069A ;Pashto
#O V s V s > U+0634 U+0651 ;;SHADDA
#O V s > U+0634
CO s CO s > U+069B U+0651 ;;SHADDA
CO s > U+069B ; sin with 3 dots below
## ;; ADD 069C-069F <<<<<<<<<<
s s > U+0633 U+0651 ;;SHADDA
s > U+0633
C r C r > U+06EF U+0651 ;;SHADDA ; missing in ArabTeX <<<
C r > U+06EF
CO r CO r > U+0697 U+0651 ;;SHADDA ; missing in ArabTeX <<<
CO r > U+0697 ; Dargwa language, Dagestan
P r P r > U+0694 U+0651 ;;SHADDA
P r > U+0694 ; r with dot below; in Kurdish .r -> 0694
#S V r V r > U+0699 U+0651 ;;SHADDA
#S V r > U+0699
#T V r V r > U+0693 U+0651 ;;SHADDA
#T V r > U+0693 ;Pashto
#-ST V r V r > U+0691 U+0651 ;;SHADDA
#-ST V r > U+0691 ;Urdu
r r > U+0631 U+0651 ;;SHADDA
r > U+0631
P q P q > U+066F U+0651 ;;SHADDA
P q > U+066F
#-B q q > U+0642 U+0651 ;;SHADDA
#-B q > U+0642
#B q q > U+06A7 U+0651 ;; maghribi
#B q > U+06A7 ;; maghribi
#S p h p h > U+06A6 U+0651 ;;SHADDA
#S p h > U+06A6
#M p p > U+06A8 U+0651 ;;SHADDA
#M p > U+06A8 ;Old malay
#-M p p > U+067E U+0651 ;;SHADDA
#-M p > U+067E
CO n CO n > U+06B1 U+0651 ;;SHADDA
CO n > U+06B1
#M C n C n > U+06BD U+0651 ;;SHADDA
#M C n > U+06BD ; <<< ArabTeX yields nūn with three dots below, which is not in Unicode!
#S C n C n > U+0683 U+0651 ;;SHADDA
#S C n > U+0683
#-SM C n C n > U+06AD U+0651 ;;SHADDA
#-SM C n > U+06AD
## U+06AE Berber <<<
#S P P n P P n > U+06B2 U+0651 ;;SHADDA
#S P P n > U+06B2 ;; old sindhi? (not in ArabTeX) <<<
P n P n > U+06BA U+0651 ;;SHADDA
P n > U+06BA ; urdu
#S V n V n > U+06BB U+0651 ;;SHADDA
#S V n > U+06BB
#-S V n V n > U+06BC U+0651 ;;SHADDA
#-S V n > U+06BC ; Pashto
n n > U+0646 U+0651 ;;SHADDA
n > U+0646
m m > U+0645 U+0651 ;;SHADDA
m > U+0645
P l P l > U+06B6 U+0651 ;;SHADDA
P l > U+06B6 ; ADDED <<<
C l C l > U+06B5 U+0651 ;;SHADDA
C l > U+06B5 ;kurdish
## U+06B7 U+06B8 U+06B9 missing <<<
l l > U+0644 U+0651 ;;SHADDA
l > U+0644
#S k h k h > U+06A9 U+0651 ;;SHADDA
#S k h > U+06A9
#-O P k P k > U+06A9 U+0651 ;;SHADDA
#-O P k > U+06A9 ; pashto urdu
#O P k P k > U+0642 U+0651 ;;SHADDA
#O P k > U+0642 ; Ottoman qaf
UL k UL k > U+063A U+0651 ;;SHADDA
UL k > U+063A
#S k k > U+06AA U+0651 ;;SHADDA
#S k > U+06AA ; kaf swash
#-S k k > U+0643 U+0651 ;;SHADDA
#-S k > U+0643
#-MO C g C g > U+062C U+0651 ;;SHADDA
#-MO C g > U+062C
#MO C g C g > U+06A0 U+0651 ;;SHADDA
#MO C g > U+06A0
CO j CO j > U+0684 U+0651 ;;SHADDA
CO j > U+0684
#O j j > U+0698 U+0651 ;;SHADDA
#O j > U+0698
#-O j j > U+062C U+0651 ;;SHADDA
#-O j > U+062C
#n Q I > U+0650 U+064A
#v Q I > U+064A
#v I > U+0650 U+064A
C I > U+0650 U+064A U+0653 ; historical spelling : madda on ya
UL I > ; suggestion of O. Smrz (arabtex-plus project) URL = ###
#n I > U+064A
#n Q i N > U+064D
#n i N > ;
#n Q i > U+0650
#v Q i N? > ;
#v i N > U+064D
#v i > U+0650
x x > U+062E U+0651 ;;SHADDA
x > U+062E
#PT U+0048 Z > U+0647 U+200C U+0020 ;eH-suffix > eH suffix
#PT U+0048 > U+0647 U+200C ; H is always heh final
#U U+0048 > U+06C3
UL h UL h > U+062E U+0651 ;;SHADDA
UL h > U+062E
(P h|U+1E25) (P h|U+1E25) > U+062D U+0651 ;;SHADDA
(P h|U+1E25) > U+062D
V h V h > U+06C1 U+0651 ;;SHADDA
V h > U+06C1
## NOTE(review): in the next two rules 'H' is the Define for the apostrophe (U+0027, hamza),
## not the capital letter U+0048 used by the '#PT'/'#U' rules above -- confirm this is intended.
#U H H > U+06C1 U+0651 ;;SHADDA
#U H > U+06C1
#U h h > U+06BE U+0651 ;;SHADDA
#U h > U+06BE ; urdu
#U ;; fix letter+h letter+h > letter+h shadda in URDU <<<
#S h h > U+06BE U+0651 ;;SHADDA
#S h > U+06BE
#-US h h > U+0647 U+0651 ;;SHADDA
#-US h > U+0647
#S P CO g P CO g > U+06B4 U+0651 ;;SHADDA
#S P CO g > U+06B4 ;; old sindhi? (not in ArabTeX) <<<
CO g CO g > U+06B3 U+0651 ;;SHADDA
CO g > U+06B3
V g V g > U+06AC U+0651 ;;SHADDA
V g > U+06AC ; kaf with dot above
P g P g > U+063A U+0651 ;;SHADDA
P g > U+063A
#M g g > U+0762 U+0651 ;;SHADDA
#M g > U+0762
#-M g g > U+06AF U+0651 ;;SHADDA
#-M g > U+06AF
G G > U+06AB U+0651 ;;SHADDA
G > U+06AB ;pashto
## U+06B0 western punjabi <<<
P f P f > U+06A1 U+0651 ;;SHADDA
P f > U+06A1
#B f f > U+06A2 U+0651 ;; SHADDA
#B f > U+06A2 ;; maghribi
## ingushi CO f > U+06A3 <<<
#-B f f > U+0641 U+0651 ;;SHADDA
#-B f > U+0641
#S V d h V d h > U+068D U+0651 ;;SHADDA
#S V d h > U+068D
#-TU V d V d > U+068A U+0651 ;;SHADDA
#-TU V d > U+068A
#T V d V d > U+0689 U+0651 ;;SHADDA
#T V d > U+0689 ;pashto
#U V d V d > U+0688 U+0651 ;;SHADDA
#U V d > U+0688 ;urdu
P V d P V d > U+068B U+0651 ;;SHADDA
P V d > U+068B ;; (like U+0688 with dot below, for Lahnda = Western Punjabi)
#S d h d h > U+068C U+0651 ;;SHADDA
#S d h > U+068C
## U+0651 was missing here: the doubled letter lost its shadda
#S C d C d > U+068E U+0651 ;;SHADDA
#S C d > U+068E ; old sindhi: not in ArabTeX
#-S C d C d > U+06EE U+0651 ;;SHADDA
#-S C d > U+06EE ; not defined in ArabTeX
CO d CO d > U+068F U+0651 ;;SHADDA
CO d > U+068F
#U CO CO d CO CO d > U+0690 U+0651 ;;SHADDA
#U CO CO d > U+0690 ; old urdu: not in ArabTeX
UL (d|z) UL (d|z) > U+0630 U+0651 ;;SHADDA
UL (d|z) > U+0630
(P d|U+1E0D) (P d|U+1E0D) > U+0636 U+0651 ;;SHADDA
(P d|U+1E0D) > U+0636
d d > U+062F U+0651 ;;SHADDA
d > U+062F
#S C c h C c h > U+0687 U+0651 ;;SHADDA
#S C c h > U+0687
#O V c V c > U+0686 U+0651 ;;SHADDA
#O V c > U+0686 ; Ottoman ç
#-O V c V c > U+0685 U+0651 ;;SHADDA
#-O V c > U+0685 ;pashto
C c C c > U+0686 U+0651 ;;SHADDA
C c > U+0686
P C c P C c > U+06BF U+0651 ;;SHADDA
P C c > U+06BF ;; cheh with dot above <<<
#M c c > U+0686 U+0651 ;;SHADDA
#M c > U+0686
#O c c > U+062C U+0651 ;;SHADDA
#O c > U+062C
#-MO c c > U+0681 U+0651 ;;SHADDA
#-MO c > U+0681 ; ح with hamza above
CO c CO c > U+0682 U+0651 ;;SHADDA
CO c > U+0682 ; old pashto
#S b h b h > U+0680 U+0651 ;;SHADDA
#S b h > U+0680
CO b CO b > U+067B U+0651 ;;SHADDA
CO b > U+067B
P b P b > U+066E U+0651 ;;SHADDA
P b > U+066E
b b > U+0628 U+0651 ;;SHADDA
b > U+0628
H A > U+0622
C A > U+064E U+0622 ; historical spelling
#n UL A > U+0649
#v UL A > U+064E U+0649
UL a > U+0670
#A UL u > U+0657 ; inverted damma
#A UL i > U+0656 ; subscript alif
#-An UL u > ;
#-An UL i > ;
#-Av UL u > U+064F;
#-Av UL i > U+0650;
#n Q AN / (A H|Ha) _ > U+064B
#n AN / (A H|Ha) _ > ;
#v Q AN / (A H|Ha) _ > ;
#v AN / (A H|Ha) _ > U+064B
#n Q AN > U+064B U+0627
#n AN > U+0627
#n Q a > U+064E
#n Q A > U+064E U+0627
#n A > U+0627
#v Q AN > U+0627
#v AN > U+064B U+0627
#v Q a >
#v a > U+064E
#v Q A > U+0627
#v A > U+064E U+0627
Ain Ain > U+0639 U+0651 ;;SHADDA
Ain > U+0639
H H > U+0621 U+0651 ;;SHADDA
H > U+0621

## Digits, dashes and punctuation
U+002C / [Digits] _ [Digits] > U+002C ; comma in numerical context not an Arabic comma
Z Z / [Digits] _ [Digits] > U+2013 ; two hyphens in numerical context -> endash
(Z|U+005D|U+005B|U+0028|U+0029)=xx / [Digits] _ [Digits] > U+202D @xx U+202C ; hyphen or brackets in numerical context: surrounded by LRO & PDF marks
#-PUSKT [Digits] > [ArDigits]
#PUSKT [Digits] > [ArEasternDigits]
Z Z Z / # _ # > U+2014
Z Z / # _ # > U+2013
Z Z > U+0640
B > U+0640
Z / # _ > U+200D ; -x > force initial form with "zero-width joiner"
Z / _ # > U+200D ; x- > force final form
U+003C U+003C > U+00BB ;<<
U+003E U+003E > U+00AB ;>>
U+00AB > U+00BB
U+00BB > U+00AB
UL > U+0640 ;_
U+003F > U+061F ;?
U+003B > U+061B ;;
U+003A U+003A > U+061E ;; '::' ligature 'DOTS' also defined above...
U+002C > U+060C ;,
U+005D / [LTR] _ [LTR] > U+200D U+005D U+200D ; keep shaping when inserting ...[...]... ...
U+005B / [LTR] _ [LTR] > U+200D U+005B U+200D ; (but of course this screws up contextual
; analysis, so quoting of hamzas etc may be necessary
;U+005D > U+005B ;] these are automatically mirrored, following Unicode rules
;U+005B > U+005D ;[
#n [VWL] > ;stripoff vowels
#v Q [VWL] > ;stripoff quoted vowels
#n Q > U+0652
@v Q > U+0652
U+00B0 > U+0652 ; °
N > ;
C > ;^
#n Z > ;-
BAR > ;|
#U P P > U+06D4 ; urdu full stop << check

## ----------------------------------------------------------------------------------
## Third pass (vocalized mappings only): sukun insertion and wasla on initial alif
## ----------------------------------------------------------------------------------
#v pass(Unicode)
#v
#v Class [CONS] = (U+0621 U+0623 U+0624 U+0626 U+0628 U+062A..U+063A U+0641..U+0647 U+066E U+066F U+0672 U+0675 U+0676 U+0678..U+06BF U+06EE U+06EF U+06FA..U+06FF U+0750..U+076D) ; to be improved: not sure that all glyphs listed (esp for Urdu,Sindhi,etc are really "consonants" in the sense that they can carry a sukun, not to mention that some of these languages may not have a sukun at all ;-)
#v Class [VWLX] = (U+0627 U+0648..U+065E U+0670) ; long vowels, vowel signs, shadda, sukun etc.
#v Class [VWL] = (U+0627 U+0648..U+0650 U+0670);
#v
#f [CONS]=k1 / _ Z? [CONS] ^U+0651 > @k1 U+0652 ; hack: 2nd consonant has no shadda
#f U+064E (U+064A|U+0648)=hc / _ (^[VWLX]|#) > U+064E @hc U+0652 ; ay and aw diphtongs
@Av U+0627 [VWL]? / [VWL] Z _ > U+0627 ; e.g. fa-isti_hraj -> initial alif has no vowel
@Bv U+0627 [VWL]? / [VWL] Z _ > U+0627 ; e.g. fa-isti_hraj -> initial alif has no vowel
#Af U+0627 [VWL]? / [VWL] Z _ > U+0671 ; e.g. fa-isti_hraj -> wasla on initial alif
#f;; U+0627 / [VWL] U+0020 _ U+0644 > U+0671 ; wasla on initial alif-lam >>>THIS DOES NOT WORK<<< implemented in first pass instead...
## Fully-vocalized mode: a word-initial alif directly followed by lam receives a fatha
#f U+0627 / # _ U+0644 > U+0627 U+064E ; otherwise initial alif takes fatha
## Fully-vocalized mode: sukun on a consonant followed by another consonant or a word boundary
#f [CONS]=k1 / _ ([CONS]|#) > @k1 U+0652 ; consonant + (consonant or final)
#f ;; [CONS]=k1 / _ # > @k1 U+0652 ; final consonant
## Expand the temporary U+FDF2 placeholder back into lam lam heh
U+FDF2 > U+0644 U+0644 U+0647 ;
## Final cleanup: drop the remaining quote (fully voc.) and hyphen (vocalized) characters
#f Q > ;
#v Z > ;