pyRXP: 1.08 added windows-1252/cp-1252 encoding
author rgbecker
Tue, 04 Jul 2006 14:23:04 +0000
changeset 2656 3e2a3ad2d595
parent 2655 c7333c9b3ed3
child 2657 ef0829fe7abc
pyRXP: 1.08 added windows-1252/cp-1252 encoding
rl_addons/pyRXP/pyRXP.c
rl_addons/pyRXP/rxp/charset.c
rl_addons/pyRXP/rxp/charset.h
rl_addons/pyRXP/rxp/input.c
rl_addons/pyRXP/rxp/rxp.c
rl_addons/pyRXP/rxp/stdio16.c
--- a/rl_addons/pyRXP/pyRXP.c	Mon Jul 03 10:11:30 2006 +0000
+++ b/rl_addons/pyRXP/pyRXP.c	Tue Jul 04 14:23:04 2006 +0000
@@ -21,7 +21,7 @@
 #include "stdio16.h"
 #include "version.h"
 #include "namespaces.h"
-#define VERSION "1.07"
+#define VERSION "1.08"
 #define MAX_DEPTH 256
 
 #if CHAR_SIZE==16
--- a/rl_addons/pyRXP/rxp/charset.c	Mon Jul 03 10:11:30 2006 +0000
+++ b/rl_addons/pyRXP/rxp/charset.c	Tue Jul 04 14:23:04 2006 +0000
@@ -17,9 +17,9 @@
 #include "charset.h"
 #include "string16.h"
 
-int iso_to_unicode[8][256];		/* latin-2 ... latin-9 */
-int iso_max_val[8];
-char8 *unicode_to_iso[8];
+int iso_to_unicode[NISO][256];		/* latin-2 ... latin-9 */
+int iso_max_val[NISO];
+char8 *unicode_to_iso[NISO];
 
 /* This table is used to initialise the above arrays */
 
@@ -153,6 +153,14 @@
 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff,
 }
 };
+static int cp_1252_table[32]=
+{
+0x20ac, -00001, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
+0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, -00001, 0x017d, -00001,
+-00001, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
+0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, -00001, 0x017e, 0x0178,
+};
+
 
 const char8 *CharacterEncodingName[CE_enum_count] = {
     "unknown",
@@ -170,6 +178,7 @@
     "ISO-8859-7",
     "ISO-8859-8",
     "ISO-8859-9",
+	"CP-1252",
 
     "UTF-16",
     "UTF-16",
@@ -193,6 +202,7 @@
     "ISO-8859-7",
     "ISO-8859-8",
     "ISO-8859-9",
+	"CP-1252",
 
     "UTF-16-B",
     "UTF-16-L",
@@ -203,6 +213,7 @@
 struct character_encoding_alias CharacterEncodingAlias[] = {
     {"ASCII", CE_ISO_646},
     {"US-ASCII", CE_ISO_646},
+    {"WINDOWS-1252", CE_CP_1252},
     {"ISO-Latin-1", CE_ISO_8859_1},
     {"ISO-Latin-2", CE_ISO_8859_2},
     {"ISO-Latin-3", CE_ISO_8859_3},
@@ -220,9 +231,54 @@
 
 static int charset_initialised = 0;
 
+static int alloc_unicode_to_iso(int i, int max)
+{
+	if(!(unicode_to_iso[i] = Malloc(max+1))){
+	    fprintf(stderr, "Malloc failed in charset initialisation\n");
+	    return -1;
+		}
+	return 0;
+}
+
+static identity_iso_to_unicode(int i,int j,int jlim)
+{
+	for(; j<jlim; j++) iso_to_unicode[i][j] = j;
+}
+
+static table_to_iso_to_unicode(int i, int *table, int j0, int jlim, int *cmax)
+{
+	int j, max= *cmax;
+	for(j=j0; j<jlim; j++){
+	    int code = table[j-j0];
+	    iso_to_unicode[i][j] = code;
+	    if(code > max) max = code;
+		}
+	*cmax = max;
+}
+
+static identity_unicode_to_iso(int i,int j,int jlim)
+{
+	for(; j<jlim; j++) unicode_to_iso[i][j] = j;
+}
+
+static unknown_unicode_to_iso(int i,int j,int jlim)
+{
+	for(; j<jlim; j++) unicode_to_iso[i][j] = '?';
+}
+
+static table_to_unicode_to_iso(int i, int *table, int j0, int jlim)
+{
+	int j;
+	for(j=j0; j<jlim; j++){
+	    int code = table[j-j0];
+	    if(code != -1)
+		unicode_to_iso[i][code] = j;
+		}
+}
+
 int init_charset(void)
 {
-    int i, j;
+    int i, max;
     union {char b[2]; short s;} bytes;
 
     if(charset_initialised)
@@ -240,39 +296,32 @@
 #endif
 
     /* Make ISO-Latin-N tables */
-
-    for(i=0; i<8; i++)
-    {
-	int max = 0x9f;
+    for(i=0; i<8; i++){
+		max = 0x9f;
+		identity_iso_to_unicode(i,0,0xa0);
+		table_to_iso_to_unicode(i,latin_table[i],0xa0,0x100,&max);
 
-	for(j=0; j<0xa0; j++)
-	    iso_to_unicode[i][j] = j;
-	for(j=0xa0; j<0x100; j++)
-	{
-	    int code = latin_table[i][j-0xa0];
-	    iso_to_unicode[i][j] = code;
-	    if(code > max) max = code;
-	}
+		iso_max_val[i] = max;
+		if(alloc_unicode_to_iso(i,max)) return -1;
+		identity_unicode_to_iso(i,0,0xa0);
+		unknown_unicode_to_iso(i,0xa0,max);
+		table_to_unicode_to_iso(i,latin_table[i],0xa0,0x100);
+    	}
+
+
+	/*cp-1252*/
+	max = 0xff;
+	i = 8;
+	identity_iso_to_unicode(i,0,0x80);
+	table_to_iso_to_unicode(i,cp_1252_table,0x80,0xa0,&max);
+	identity_iso_to_unicode(i,0xa0,0x100);
 
 	iso_max_val[i] = max;
-
-	if(!(unicode_to_iso[i] = Malloc(max+1)))
-	{
-	    fprintf(stderr, "Malloc failed in charset initialisation\n");
-	    return -1;
-	}
-
-	for(j=0; j<0xa0; j++)
-	    unicode_to_iso[i][j] = j;
-	for(j=0xa0; j<=max; j++)
-	    unicode_to_iso[i][j] = '?';
-	for(j=0xa0; j<0x100; j++)
-	{
-	    int code = latin_table[i][j-0xa0];
-	    if(code != -1)
-		unicode_to_iso[i][code] = j;
-	}
-    }
+	if(alloc_unicode_to_iso(i,max)) return -1;
+	identity_unicode_to_iso(i,0,0x80);
+	unknown_unicode_to_iso(i,0x80,max);
+	identity_unicode_to_iso(i,0xa0,0x100);
+	table_to_unicode_to_iso(i,cp_1252_table,0x80,0xa0);
 
     return 0;
 }
@@ -280,13 +329,9 @@
 void deinit_charset(void)
 {
     int i;
-
-    if(!charset_initialised)
-	return;
+    if(!charset_initialised) return;
     charset_initialised = 0;
-
-    for(i=0; i<8; i++)
-	Free(unicode_to_iso[i]);
+    for(i=0; i<NISO; i++) Free(unicode_to_iso[i]);
 }
 
 /* Return true if the encoding has 8-bit input units and is the same
@@ -294,7 +339,7 @@
 
 int EncodingIsAsciiSuperset(CharacterEncoding enc)
 {
-    return enc >= CE_unspecified_ascii_superset && enc <= CE_ISO_8859_9;
+    return enc >= CE_unspecified_ascii_superset && enc <= CE_CP_1252;
 }
 
 /* 
--- a/rl_addons/pyRXP/rxp/charset.h	Mon Jul 03 10:11:30 2006 +0000
+++ b/rl_addons/pyRXP/rxp/charset.h	Tue Jul 04 14:23:04 2006 +0000
@@ -31,11 +31,12 @@
 
 enum character_encoding {
     CE_unknown, CE_unspecified_ascii_superset,
-    CE_UTF_8, CE_ISO_646, 
+    CE_UTF_8, CE_ISO_646,
     CE_ISO_8859_1,
 
     CE_ISO_8859_2, CE_ISO_8859_3, CE_ISO_8859_4, CE_ISO_8859_5,
     CE_ISO_8859_6, CE_ISO_8859_7, CE_ISO_8859_8, CE_ISO_8859_9,
+	CE_CP_1252,
 
     CE_UTF_16B, CE_UTF_16L, CE_ISO_10646_UCS_2B, CE_ISO_10646_UCS_2L, 
     CE_enum_count
@@ -57,10 +58,12 @@
 			CharacterEncoding *enc3);
 STD_API CharacterEncoding FindEncoding(char8 *name);
 
-/* Translation tables for Latin-N - do this right sometime! XXX */
+/* Translation tables for Latin-N and similar - do this right sometime! XXX */
+
+#define NISO 9
 
-extern STD_API int iso_to_unicode[8][256];
-extern STD_API int iso_max_val[8];
-extern STD_API char8 *unicode_to_iso[8];
+extern STD_API int iso_to_unicode[NISO][256];
+extern STD_API int iso_max_val[NISO];
+extern STD_API char8 *unicode_to_iso[NISO];
 
 #endif /* CHARSET_H */
--- a/rl_addons/pyRXP/rxp/input.c	Mon Jul 03 10:11:30 2006 +0000
+++ b/rl_addons/pyRXP/rxp/input.c	Tue Jul 04 14:23:04 2006 +0000
@@ -201,6 +201,7 @@
     case CE_ISO_8859_7:
     case CE_ISO_8859_8:
     case CE_ISO_8859_9:
+	case CE_CP_1252:
     case CE_unspecified_ascii_superset:
 	return s->bytes_before_current_line + s->next;
     case CE_UTF_8:
@@ -707,6 +708,7 @@
     case CE_ISO_8859_7:
     case CE_ISO_8859_8:
     case CE_ISO_8859_9:
+	case CE_CP_1252:
 	trans = translate_latin;
 	break;
     case CE_UTF_8:
--- a/rl_addons/pyRXP/rxp/rxp.c	Mon Jul 03 10:11:30 2006 +0000
+++ b/rl_addons/pyRXP/rxp/rxp.c	Tue Jul 04 14:23:04 2006 +0000
@@ -768,6 +768,7 @@
     case CE_ISO_8859_7:
     case CE_ISO_8859_8:
     case CE_ISO_8859_9:
+	case CE_CP_1252:
 	tablenum = (encoding - CE_ISO_8859_2);
 	return c <= iso_max_val[tablenum] && unicode_to_iso[tablenum][c] != '?';
 
--- a/rl_addons/pyRXP/rxp/stdio16.c	Mon Jul 03 10:11:30 2006 +0000
+++ b/rl_addons/pyRXP/rxp/stdio16.c	Tue Jul 04 14:23:04 2006 +0000
@@ -204,6 +204,7 @@
     case CE_ISO_8859_7:
     case CE_ISO_8859_8:
     case CE_ISO_8859_9:
+	case CE_CP_1252:
     case CE_unspecified_ascii_superset:
 	if(file->flags & FILE16_crlf)
 	{
@@ -311,6 +312,7 @@
     case CE_ISO_8859_7:
     case CE_ISO_8859_8:
     case CE_ISO_8859_9:
+	case CE_CP_1252:
 	tablenum = (file->enc - CE_ISO_8859_2);
 	max = iso_max_val[tablenum];
 	from_unicode = unicode_to_iso[tablenum];