summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/librcd.c 35
-rw-r--r-- src/librcd.h 3
2 files changed, 37 insertions, 1 deletions
diff --git a/src/librcd.c b/src/librcd.c
index 55eefc0..36986cc 100644
--- a/src/librcd.c
+++ b/src/librcd.c
@@ -1,5 +1,7 @@
#include <stdio.h>
+#include "../config.h"
+
#define _LIBRCD_C
#include "librcd.h"
@@ -252,6 +254,36 @@ static int check_utf8(const unsigned char *buf, int len) {
return res;
}
+/*
+ * Heuristic check for Latin-script text.
+ *
+ * In Russian text a whole word consists of characters with codes > 127.
+ * In Latin-script languages every word contains, besides umlauts and other
+ * accented letters, at least one plain ASCII letter (code < 128).
+ *
+ * Bytes >= 128 and ASCII letters accumulate into the current word; any other
+ * ASCII byte is treated as a word separator. The end of the buffer also
+ * terminates the current word, so input without a trailing separator is
+ * still examined.
+ *
+ * buf - bytes to examine
+ * len - number of bytes in buf
+ *
+ * Returns 0 as soon as one word looks Cyrillic, 1 otherwise (including when
+ * the buffer contains no bytes >= 128 at all).
+ */
+
+/* Non-zero if a finished word with `high` bytes >= 128 and `latin` ASCII
+   letters looks Cyrillic rather than Latin. */
+static int check_latin_word_is_cyrillic(int high, int latin) {
+    // A word without any bytes >= 128 is never considered Cyrillic.
+    if (high <= 0) return 0;
+    // Only bytes >= 128: no plain latin letter, so it isn't a latin word.
+    if (!latin) return 1;
+    // Too many bytes >= 128 per plain latin letter (integer ratio above 4).
+    return (high / latin) > 4;
+}
+
+static int check_latin(const unsigned char *buf, int len) {
+    int i;
+    int word = 0;   // bytes >= 128 seen in the current word
+    int latin = 0;  // ASCII letters seen in the current word
+
+    for (i = 0; i < len; i++) {
+        unsigned char c = buf[i];
+
+        if (c >= 128) {
+            // Could be cyrillic word
+            word++;
+        } else if (((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z'))) {
+            // Latin character inside a word, so it isn't cyrillic word
+            latin++;
+        } else {
+            // Treating as a word separator.
+            if (check_latin_word_is_cyrillic(word, latin)) return 0;
+
+            word = 0;
+            latin = 0;
+        }
+    }
+
+    // The last word has no separator after it; it must be checked as well,
+    // otherwise text ending in a cyrillic word is reported as latin.
+    if (check_latin_word_is_cyrillic(word, latin)) return 0;
+
+    return 1;
+}
rcd_russian_charset rcdGetRussianCharset(const char *buf,int len) {
@@ -259,6 +291,9 @@ rcd_russian_charset rcdGetRussianCharset(const char *buf,int len) {
l = len?len:strlen(buf);
if (check_utf8(buf,l)>1) return RUSSIAN_CHARSET_UTF8;
+#ifdef DETECT_LATIN
+ if (check_latin(buf,l)) return RUSSIAN_CHARSET_LATIN;
+#endif /* DETECT_LATIN */
return is_win_charset2(buf,l);
}
diff --git a/src/librcd.h b/src/librcd.h
index 56db6c8..6fc3281 100644
--- a/src/librcd.h
+++ b/src/librcd.h
@@ -9,7 +9,8 @@ enum rcd_russian_charset_t {
RUSSIAN_CHARSET_WIN = 0,
RUSSIAN_CHARSET_KOI,
RUSSIAN_CHARSET_UTF8,
- RUSSIAN_CHARSET_ALT
+ RUSSIAN_CHARSET_ALT,
+ RUSSIAN_CHARSET_LATIN
};
typedef enum rcd_russian_charset_t rcd_russian_charset;