diff options
Diffstat (limited to 'src/recode.c')
-rw-r--r-- | src/recode.c | 358 |
1 files changed, 213 insertions, 145 deletions
diff --git a/src/recode.c b/src/recode.c index 27dff92..ee9ac53 100644 --- a/src/recode.c +++ b/src/recode.c @@ -21,10 +21,17 @@ #define RCC_ACCEPTABLE_PROBABILITY 0 #define RCC_ACCEPTABLE_LENGTH 3 -static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len, rcc_string *retstring) { +typedef enum rcc_detect_language_confidence_t { + RCC_DETECT_LANGUAGE_CONFIDENCE_UNSURE = 0, + RCC_DETECT_LANGUAGE_CONFIDENCE_ALMOST, + RCC_DETECT_LANGUAGE_CONFIDENCE_SURE, + RCC_DETECT_LANGUAGE_CONFIDENCE_CACHED +} rcc_detect_language_confidence; + +static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len, rcc_string *retstring, rcc_detect_language_confidence *confidence) { rcc_speller speller = NULL; - unsigned long i, nlanguages; - rcc_language_config config, config0 = NULL; + long i, nlanguages; + rcc_language_config config, config0 = NULL, config1 = NULL; rcc_string recoded; unsigned char *utf8; size_t j, mode; @@ -48,6 +55,9 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c unsigned long k; rcc_language_id *parrents; size_t chars = 0; + char llang[RCC_MAX_LANGUAGE_CHARS]; + rcc_language_id locale_lang; + unsigned char defstep = 0; unsigned long accepted_nonenglish_langs = 0; @@ -61,6 +71,7 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c english_lang = rccStringGetLanguage(recoded); if (retstring) *retstring = recoded; else free(recoded); + if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_CACHED; return english_lang; } } @@ -72,17 +83,33 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c english_lang = rccGetLanguageByName(ctx, rcc_english_language_sn); - for (i=0;i<nlanguages;i++) { - if (i) config = rccGetUsableConfig(ctx, (rcc_language_id)i); - else config = rccGetCurrentConfig(ctx); - if (!config) continue; - + for (i=0;i<nlanguages;(defstep>1)?i++:i) { if (i) { - if (config==config0) continue; - } else config0=config; + config = rccGetUsableConfig(ctx, (rcc_language_id)i); + if ((!config)||(config==config0)||(config==config1)) continue; + } else { + switch (defstep) { + case 0: + config = rccGetCurrentConfig(ctx); + config0 = config; + break; + case 1: + if (!rccLocaleGetLanguage(llang ,ctx->locale_variable, RCC_MAX_LANGUAGE_CHARS)) { + locale_lang = rccGetLanguageByName(ctx, llang); + config = rccGetConfig(ctx, locale_lang); + } else config = NULL; + config1 = config; + break; + default: + config = NULL; + } + defstep++; + if ((!config)||(config0==config1)) continue; + } + if (bestfixlang != (rcc_language_id)-1) { - parrents = ctx->language_parrents[i]; + parrents = ((rcc_language_internal*)config->language)->parrents; for (k = 0;parrents[k] != (rcc_language_id)-1;k++) if (parrents[k] == bestfixlang) break; @@ -192,6 +219,8 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c if (english_string) free(english_string); if (retstring) *retstring = best_string; else if (best_string) free(best_string); + + if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_SURE; return bestlang; } @@ -199,6 +228,8 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c if (best_string) free(best_string); if (retstring) *retstring = english_string; else if (english_string) free(english_string); + + if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_SURE; return english_lang; } @@ -206,6 +237,8 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c if (english_string) free(english_string); if (retstring) *retstring = best_string; else if (best_string) free(best_string); + + if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_ALMOST; return bestlang; } @@ -213,6 +246,8 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c if (best_string) free(best_string); if (retstring) *retstring = english_string; else if (english_string) free(english_string); + + if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_ALMOST; return english_lang; } @@ -220,89 +255,152 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c if (english_string) free(english_string); if (retstring) *retstring = best_string; else if (best_string) free(best_string); + + if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_UNSURE; return bestlang; } else if (best_string) free(best_string); if ((english_res > RCC_ACCEPTABLE_PROBABILITY)&&(english_longest > RCC_ACCEPTABLE_LENGTH)) { if (retstring) *retstring = english_string; else if (english_string) free(english_string); + + if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_UNSURE; return english_lang; } else if (english_string) free(english_string); return (rcc_language_id)-1; } - rcc_language_id rccDetectLanguage(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) { if (!ctx) { if (rcc_default_ctx) ctx = rcc_default_ctx; else return -1; } - return rccDetectLanguageInternal(ctx, class_id, buf, len, NULL); + return rccDetectLanguageInternal(ctx, class_id, buf, len, NULL, NULL); } -static rcc_autocharset_id rccConfigDetectCharsetInternal(rcc_language_config config, rcc_class_id class_id, const char *buf, size_t len) { - int err; - rcc_context ctx; - rcc_class_type class_type; - rcc_engine_ptr engine; - rcc_autocharset_id autocharset_id; - - if ((!buf)||(!config)) return (rcc_autocharset_id)-1; +static int rccIsParrentLanguage(rcc_language_config config, rcc_language_id parrent) { + unsigned int i; + rcc_language_id language; + rcc_language_id *list; + + language = rccConfigGetLanguage(config); + if (parrent == language) return 1; - ctx = config->ctx; + list = ((rcc_language_internal*)config->language)->parrents; + for (i=0;list[i] != (rcc_language_id)-1;i++) + if (list[i] == parrent) return 1; - err = rccConfigConfigure(config); - if (err) return (rcc_autocharset_id)-1; + return 0; +} + + +static int rccAreRelatedLanguages(rcc_language_config c1, rcc_language_config c2) { + rcc_language_id l1, l2; + + l1 = rccConfigGetLanguage(c1); + l2 = rccConfigGetLanguage(c2); - class_type = rccGetClassType(ctx, class_id); - if ((class_type != RCC_CLASS_FS)||((class_type == RCC_CLASS_FS)&&(rccGetOption(ctx, RCC_OPTION_AUTODETECT_FS_TITLES)))) { - rccMutexLock(config->mutex); - engine = rccConfigGetCurrentEnginePointer(config); - if ((engine)&&(engine->func)) autocharset_id = engine->func(&config->engine_ctx, buf, len); - else autocharset_id = (rcc_autocharset_id)-1; - rccMutexUnLock(config->mutex); - return autocharset_id; - } + if (rccIsParrentLanguage(c1, l2)) return 1; + if (rccIsParrentLanguage(c2, l1)) return 1; - return (rcc_autocharset_id)-1; + return 0; } -rcc_autocharset_id rccConfigDetectCharset(rcc_language_config config, rcc_class_id class_id, const char *buf, size_t len) { - return rccConfigDetectCharsetInternal(config, class_id, buf, len); -} +static char *rccRecodeTranslate(rcc_language_config *config, rcc_class_id class_id, const char *utfstring) { + rcc_context ctx; + rcc_language_config curconfig; + + rcc_option_value translate; + rcc_class_type ctype; + rcc_language_id language_id, english_language_id, current_language_id; -static int rccAreLanguagesRelated(rcc_context ctx, rcc_language_id l1, rcc_language_id l2, rcc_language_id skip) { - unsigned int i; - rcc_language_id *list; + char llang[RCC_MAX_LANGUAGE_CHARS]; - if ((l1 == skip)||(l2 == skip)) return 0; + rcc_translate trans, entrans; - if (l1 == l2) return 1; + char *translated; + + ctx = (*config)->ctx; + + translate = rccGetOption(ctx, RCC_OPTION_TRANSLATE); + if (translate == RCC_OPTION_TRANSLATE_OFF) return NULL; + + ctype = rccGetClassType(ctx, class_id); + if ((ctype != RCC_CLASS_TRANSLATE_LOCALE)&&(ctype != RCC_CLASS_TRANSLATE_CURRENT)&&(ctype != RCC_CLASS_TRANSLATE_FROM)) return NULL; + + language_id = rccConfigGetLanguage(*config); + + english_language_id = rccGetLanguageByName(ctx, rcc_english_language_sn); - list = ctx->language_parrents[l1]; - for (i=0;list[i] != (rcc_language_id)-1;i++) - if (list[i] == l2) return 1; + if (translate == RCC_OPTION_TRANSLATE_TO_ENGLISH) { + current_language_id = english_language_id ; + } else { + if (ctype == RCC_CLASS_TRANSLATE_LOCALE) { + if (!rccLocaleGetLanguage(llang ,ctx->locale_variable, RCC_MAX_LANGUAGE_CHARS)) + current_language_id = rccGetLanguageByName(ctx, llang); + else + current_language_id = (rcc_language_id)-1; + } else + current_language_id = rccGetCurrentLanguage(ctx); + } + + if (current_language_id == (rcc_language_id)-1) return NULL; + if (language_id == current_language_id) return NULL; - list = ctx->language_parrents[l2]; - for (i=0;list[i] != (rcc_language_id)-1;i++) - if (list[i] == l1) return 1; + curconfig = rccGetConfig(ctx, current_language_id); + if (!curconfig) return NULL; - return 0; + if (rccConfigConfigure(curconfig)) return NULL; + + if (translate == RCC_OPTION_TRANSLATE_SKIP_RELATED) { + if (rccAreRelatedLanguages(curconfig, *config)) return NULL; + } + + if (translate == RCC_OPTION_TRANSLATE_SKIP_PARRENT) { + if (rccIsParrentLanguage(curconfig, language_id)) return NULL; + } + + trans = rccConfigGetTranslator(*config, current_language_id); + if (trans) { + translated = rccTranslate(trans, utfstring); + if (translated) { + if ((!((rcc_language_internal*)curconfig->language)->latin)&&(rccIsASCII(translated))) { + free(translated); + translated = NULL; + } + } + } else translated = NULL; + + if ((!translated)&&(current_language_id != english_language_id)&&(!rccAreRelatedLanguages(*config, curconfig))) { + curconfig = rccGetConfig(ctx, english_language_id); + if (!curconfig) return NULL; + if (rccConfigConfigure(curconfig)) return NULL; + + entrans = rccConfigGetEnglishTranslator(*config); + if (entrans) translated = rccTranslate(entrans, utfstring); + } + + if (translated) *config = curconfig; + return translated; } rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) { int err; size_t ret; + rcc_language_config config; rcc_language_id language_id, detected_language_id; rcc_autocharset_id charset_id; rcc_iconv icnv = NULL; rcc_string result; + rcc_class_type class_type; rcc_option_value usedb4; const char *charset; + char *translate = NULL; + rcc_detect_language_confidence confidence; if (!ctx) { if (rcc_default_ctx) ctx = rcc_default_ctx; @@ -318,29 +416,38 @@ rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, if (language_id == (rcc_language_id)-1) return NULL; if (!strcasecmp(ctx->languages[language_id]->sn, rcc_disabled_language_sn)) return NULL; - + class_type = rccGetClassType(ctx, class_id); usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE); -/* - if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) { - result = rccDb4GetKey(ctx->db4ctx, buf, len); - if (result) { - if (rccStringFixID(result, ctx)) free(result); - else return result; - } - } - - if (rccGetOption(ctx, RCC_OPTION_AUTODETECT_LANGUAGE)) { - detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len); - if (detected_language_id != (rcc_language_id)-1) - language_id = detected_language_id; - } -*/ - detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len, &result); + detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len, &result, &confidence); if (detected_language_id != (rcc_language_id)-1) { #ifdef RCC_DEBUG_LANGDETECT - printf("Language %i(%s): %s\n", rccStringGetLanguage(result), rccStringGetLanguage(result)?rccGetLanguageName(ctx, rccStringGetLanguage(result)):"", result); + printf("Language %i(%s): %s\n", rccStringGetLanguage(result), rccStringGetLanguage(result)?rccGetLanguageName(ctx, rccStringGetLanguage(result)):"", result); #endif /* RCC_DEBUG_LANGDETECT */ + + if ((result)&&(rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&(class_type == RCC_CLASS_TRANSLATE_FROM)) { + rccMutexLock(ctx->mutex); + config = rccGetCurrentConfig(ctx); + translate = rccRecodeTranslate(&config, class_id, rccStringGetString(result)); + rccMutexUnLock(ctx->mutex); + + if (translate) { + language_id = rccConfigGetLanguage(config); + free(result); + result = rccCreateString(language_id, translate, 0); + } + } + + + if ((result)&& + (usedb4&RCC_OPTION_LEARNING_FLAG_LEARN)&& + (confidence!=RCC_DETECT_LANGUAGE_CONFIDENCE_CACHED)&& + ((language_id==detected_language_id)||(confidence!=RCC_DETECT_LANGUAGE_CONFIDENCE_UNSURE))&& + (!rccStringSetLang(result, ctx->languages[language_id]->sn))) { + + rccDb4SetKey(ctx->db4ctx, buf, len, result); + } + return result; } @@ -349,7 +456,8 @@ rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, if (err) return NULL; rccMutexLock(ctx->mutex); - charset_id = rccDetectCharset(ctx, class_id, buf, len); + if (class_type == RCC_CLASS_KNOWN) charset_id = (rcc_autocharset_id)-1; + else charset_id = rccDetectCharset(ctx, class_id, buf, len); if (charset_id != (rcc_autocharset_id)-1) { icnv = ctx->iconv_auto[charset_id]; if (rccGetOption(ctx, RCC_OPTION_AUTOENGINE_SET_CURRENT)) { @@ -362,10 +470,24 @@ rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, if (icnv) { ret = rccIConvInternal(ctx, icnv, buf, len); if (ret == (size_t)-1) return NULL; - result = rccCreateString(language_id, ctx->tmpbuffer, ret); + + if ((rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&(rccGetClassType(ctx, class_id) == RCC_CLASS_TRANSLATE_FROM)) { + config = rccGetCurrentConfig(ctx); + translate = rccRecodeTranslate(&config , class_id, ctx->tmpbuffer); + if (translate) language_id = rccConfigGetLanguage(config); + } + + result = rccCreateString(language_id, translate?translate:ctx->tmpbuffer, translate?0:ret); } else { - result = rccCreateString(language_id, buf, len); + if ((rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&(rccGetClassType(ctx, class_id) == RCC_CLASS_TRANSLATE_FROM)) { + config = rccGetCurrentConfig(ctx); + translate = rccRecodeTranslate(&config , class_id, buf); + if (translate) language_id = rccConfigGetLanguage(config); + } + + result = rccCreateString(language_id, translate?translate:buf, translate?0:len); } + rccMutexUnLock(ctx->mutex); if ((result)&&(usedb4&RCC_OPTION_LEARNING_FLAG_LEARN)) { @@ -385,13 +507,7 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s char *translated = NULL; rcc_language_config config; rcc_language_id language_id; - rcc_language_id current_language_id; - rcc_language_id english_language_id; rcc_class_type class_type; - rcc_option_value translate; - rcc_translate trans, entrans; - const char *langname; - unsigned char english_source; rcc_iconv icnv; if (!ctx) { @@ -414,74 +530,10 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s if (err) return NULL; class_type = rccGetClassType(ctx, class_id); - translate = rccGetOption(ctx, RCC_OPTION_TRANSLATE); - langname = rccGetLanguageName(ctx, language_id); - if (strcasecmp(langname, rcc_english_language_sn)) english_source = 0; - else english_source = 1; - - if ((class_type != RCC_CLASS_FS)&&((translate==RCC_OPTION_TRANSLATE_FULL)||((translate)&&(!english_source)))) { - english_language_id = rccGetLanguageByName(ctx, rcc_english_language_sn); - + if (((class_type == RCC_CLASS_TRANSLATE_LOCALE)||(class_type == RCC_CLASS_TRANSLATE_CURRENT))&&(rccGetOption(ctx, RCC_OPTION_TRANSLATE))) { rccMutexLock(ctx->mutex); - - current_language_id = rccGetCurrentLanguage(ctx); - if (current_language_id != language_id) { - if (translate != RCC_OPTION_TRANSLATE_TO_ENGLISH) { - trans = rccConfigGetTranslator(config, current_language_id); - if (trans) { - translated = rccTranslate(trans, utfstring); - if (translated) { - if ((current_language_id != english_language_id)&&(rccIsASCII(translated))) { - /* Ffrench to german (no umlauts) => not related - english to german (no umlauts) => skiping english relations - DS: Problem if we have relation between french and german */ - if (rccAreLanguagesRelated(ctx, language_id, current_language_id, english_language_id)) { - free(translated); - translated = NULL; - translate = 0; - } - } - } - if (translated) { - language_id = current_language_id; - - config = rccGetConfig(ctx, language_id); - if (!config) { - rccMutexUnLock(ctx->mutex); - free(translated); - return NULL; - } - - err = rccConfigConfigure(config); - if (err) { - rccMutexUnLock(ctx->mutex); - free(translated); - return NULL; - } - } - } - } - - if ((translate == RCC_OPTION_TRANSLATE_TO_ENGLISH)||((translate)&&(!translated)&&(!english_language_id == current_language_id)&&(!rccAreLanguagesRelated(ctx, language_id, current_language_id, (rcc_language_id)-1)))) { - entrans = rccConfigGetEnglishTranslator(config); - if (entrans) { - translated = rccTranslate(config->entrans, utfstring); -/* - config = rccGetConfig(ctx, language_id); - if (!config) { - rccMutexUnLock(ctx->mutex); - return translated; - } - - err = rccConfigConfigure(config); - if (err) { - rccMutexUnLock(ctx->mutex); - return translated; - }*/ - } - } - } + translated = rccRecodeTranslate(&config, class_id, utfstring); rccMutexUnLock(ctx->mutex); } @@ -492,7 +544,7 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s return result; } } - + rccMutexLock(ctx->mutex); rccMutexLock(config->mutex); icnv = config->iconv_to[class_id]; @@ -536,10 +588,14 @@ char *rccSizedRecode(rcc_context ctx, rcc_class_id from, rcc_class_id to, const if ((class_type == RCC_CLASS_FS)&&(rccGetOption(ctx, RCC_OPTION_AUTODETECT_FS_NAMES))) goto recoding; if (rccGetOption(ctx, RCC_OPTION_LEARNING_MODE)) goto recoding; if (rccGetOption(ctx, RCC_OPTION_AUTODETECT_LANGUAGE)) goto recoding; - if (rccGetOption(ctx, RCC_OPTION_TRANSLATE)) goto recoding; + if ((rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&((class_type == RCC_CLASS_TRANSLATE_LOCALE)||(class_type == RCC_CLASS_TRANSLATE_CURRENT))) goto recoding; + + class_type = rccGetClassType(ctx, from); + if ((rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&(class_type == RCC_CLASS_TRANSLATE_FROM)) goto recoding; rccMutexLock(ctx->mutex); - from_charset_id = rccDetectCharset(ctx, from, buf, len); + if (class_type == RCC_CLASS_KNOWN) from_charset_id = (rcc_autocharset_id)-1; + else from_charset_id = rccDetectCharset(ctx, from, buf, len); if (from_charset_id != (rcc_charset_id)-1) { from_charset = rccGetAutoCharsetName(ctx, from_charset_id); to_charset = rccGetCurrentCharsetName(ctx, to); @@ -606,6 +662,18 @@ char *rccFS(rcc_context ctx, rcc_class_id from, rcc_class_id to, const char *fsp rccMutexUnLock(config->mutex); rccMutexUnLock(ctx->mutex); } else result = NULL; + + if (!result) { + config = rccGetCurrentConfig(ctx); + if (config) { + rccMutexLock(ctx->mutex); + rccMutexLock(config->mutex); + result = rccFS3(config, to, prefix, rccStringGetString(string)); + rccMutexUnLock(config->mutex); + rccMutexUnLock(ctx->mutex); + } + } + free(string); } else result = NULL; |