Helsinki-NLP/opus-mt-zh-en翻译小模型

1、手动下载模型到本地

2、安装必要的扩展库（transformers、torch，以及该模型分词器所依赖的 sentencepiece）

3、demo：

import os  
import time  
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM  
import torch  
  
# Load the tokenizer and the seq2seq model (the path literal is a placeholder
# for the locally downloaded model directory)
tokenizer = AutoTokenizer.from_pretrained("模型路径")
model = AutoModelForSeq2SeqLM.from_pretrained("模型路径")

# Select the device: CUDA when available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Input and output directories (placeholder literals)
input_dir = "输入目录路径"
output_dir = "输出目录路径"

# Make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)
  
def translate_text(text, tokenizer, model, device, max_length=512):
    """Translate ``text`` and return the translation as one string.

    The text is encoded once into token ids. The id sequence is then cut
    into windows of at most ``max_length`` tokens, every window is translated
    on its own with beam search, and the decoded pieces are joined with a
    single space.

    Args:
        text: Source text to translate.
        tokenizer: Object providing ``encode`` and ``decode``.
        model: Object providing ``generate``.
        device: Device the encoded ids are moved to before generation.
        max_length: Maximum number of tokens per window; also passed to
            ``generate`` as the output length limit. Defaults to 512, the
            value that used to be hard-coded.

    Returns:
        The translated text, or ``""`` when ``text`` is empty or contains
        only whitespace.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    # Nothing to translate: do not run generation on an input that would
    # consist of special tokens only.
    if not text or not text.strip():
        return ""

    input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
    total_tokens = input_ids.shape[1]

    translations = []
    # NOTE: windows are cut by token position, so a boundary can fall in the
    # middle of a sentence; splitting the text on sentence boundaries before
    # calling this function avoids that.
    with torch.no_grad():
        for start in range(0, total_tokens, max_length):
            window = input_ids[:, start:start + max_length]
            outputs = model.generate(
                window, max_length=max_length, num_beams=4, early_stopping=True
            )
            translations.append(
                tokenizer.decode(outputs[0], skip_special_tokens=True)
            )

    return " ".join(translations)
  
def translate_filename(filename, tokenizer, model, device):
    """Build a translated, filesystem-safe ``.txt`` file name.

    The extension of ``filename`` is dropped, the remaining stem is
    translated with ``translate_text``, characters that are not allowed in
    file names are removed, and ``.txt`` is appended.

    Args:
        filename: Original file name (not a full path).
        tokenizer: Tokenizer forwarded to ``translate_text``.
        model: Model forwarded to ``translate_text``.
        device: Device forwarded to ``translate_text``.

    Returns:
        The translated file name ending in ``.txt``. When nothing usable is
        left after sanitising, the original stem is kept instead.
    """
    name_without_ext = os.path.splitext(filename)[0]
    translated_name = translate_text(name_without_ext, tokenizer, model, device)

    # Remove characters that are invalid in file names. Path separators and
    # double quotes are included so the result can never point into another
    # directory; control characters (e.g. newlines) become spaces.
    table = dict.fromkeys(map(ord, '?:<>|*"/\\'))
    table.update(dict.fromkeys(range(32), " "))
    cleaned = translated_name.translate(table)

    # Collapse whitespace runs and drop leading/trailing dots and spaces so
    # the stem does not end in "." right before the extension.
    stem = " ".join(cleaned.split()).strip(" .")
    if not stem:
        # Fall back to the source stem rather than producing a bare ".txt".
        stem = name_without_ext

    return f"{stem}.txt"
  
# Run statistics
start_time = time.time()
file_count = 0

# Walk the entries of the input directory in a stable order
for filename in sorted(os.listdir(input_dir)):
    file_path = os.path.join(input_dir, filename)

    # Only regular .txt files are translated (a directory named "x.txt"
    # would otherwise fail when opened).
    if not filename.endswith(".txt") or not os.path.isfile(file_path):
        continue

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        translated_text = translate_text(text, tokenizer, model, device)
        translated_filename = translate_filename(filename, tokenizer, model, device)

        # Different source names can translate to the same target name.
        # Because the source is deleted afterwards, overwriting an existing
        # output would lose data, so pick the first free "<stem>_<n>" name.
        stem, ext = os.path.splitext(translated_filename)
        output_file_path = os.path.join(output_dir, translated_filename)
        suffix = 1
        while os.path.exists(output_file_path):
            output_file_path = os.path.join(output_dir, f"{stem}_{suffix}{ext}")
            suffix += 1

        with open(output_file_path, "w", encoding="utf-8") as file:
            file.write(translated_text)

        # The source file is removed only after the translation was written
        os.remove(file_path)

        file_count += 1

    except Exception as e:
        # Top-level boundary: report the failure and go on with the next file
        print(f"处理文件 {filename} 时发生错误: {e}")

end_time = time.time()
elapsed_time = end_time - start_time
average_time_per_file = elapsed_time / file_count if file_count > 0 else 0

print(f"翻译完成，处理了 {file_count} 个文件。")
print(f"总耗时: {elapsed_time:.2f} 秒")
print(f"平均每篇文件翻译时间: {average_time_per_file:.2f} 秒")

demo解释：

加载模型和分词器：首先，它使用AutoTokenizer和AutoModelForSeq2SeqLM从指定的模型路径加载分词器和模型，使用预训练的序列到序列模型进行文本翻译任务。
设置设备：脚本检查CUDA是否可用，并据此将模型加载到GPU或CPU上用于加速翻译过程。
定义翻译函数：translate_text函数接受一段文本、分词器、模型和设备作为输入，使用模型生成翻译文本。
分割长文本：demo 中没有单独的分割函数，translate_text 函数会先把整段文本编码成 token 序列，再按每 512 个 token 切成若干片段分别翻译，因为模型的输入长度有限制，该模型的上限是 512 个 token。
遍历待翻译文本文件：脚本遍历指定目录中的所有文本文件，对每个文件执行以下操作：
- 读取文件内容。
- 如果文本长度超过模型的最大输入长度，将其分割成多个片段并分别翻译，然后将翻译后的文本片段合并。
- 否则，直接翻译整个文本。
- 生成翻译后的文件名，确保文件名不包含特定字符。
- 将翻译后的文本写入新文件，并删除原始文件。
错误处理：如果在处理文件时发生错误，脚本会捕获异常并打印错误信息。
完成提示：所有文件处理完成后，脚本打印一条消息表示翻译任务已完成。

测试翻译文本：

SEO优化全攻略：提升网站排名与流量的关键策略

SEO基础介绍：简要介绍SEO的概念、重要性以及基本原理，帮助读者建立对SEO的基本认识。
关键词研究：详细讲解如何进行关键词研究，包括选择关键词的工具、方法以及注意事项，帮助读者找到适合自己的关键词。
内容优化：强调内容质量在SEO中的重要性，介绍如何创作高质量、有价值的内容，并合理使用关键词，避免过度堆砌。
链接建设：讲解外部链接和内部链接对SEO的影响，以及如何通过链接建设提升网站的权威性和排名。
技术SEO：介绍网站结构、代码优化、加载速度、移动友好性和安全性等技术因素对SEO的影响，并提供相应的优化建议。
SEO监测与调整：强调SEO是一个持续的过程，需要定期监测和调整策略。介绍如何使用分析工具跟踪关键词排名、网站流量和用户行为，并根据数据进行策略调整。
SEO案例分享：通过实际案例分享SEO优化的成功经验和失败教训，帮助读者更好地理解SEO的实践应用。
通过以上几个方面的深入探讨，文章将为读者提供一份全面、实用的SEO优化指南，帮助他们提升网站的可见性和流量，实现业务增长。

执行模型翻译结果：

SEO optimized overall strategy a key strategy for upgrading website ranking and traffic

SEO Basic Introduction: Brief description of the concept, importance, and rationale of SEO. Keyword research: Detailed description of how to conduct keyword research, including tools, methods, and attention to select keywords. Content optimization: Highlighting the importance of content quality in SEO, describing how to create high-quality, valuable content, and rational use of keywords to avoid excessive stacking. Link building: Explaining the impact of external and internal links on SEO, and how to increase the authority and ranking of the website by building links.

由于超出了模型 512 个 token（而不是 512 个字符）的长度限制，后面的内容被自动截掉了。

分段输出版本：

import os
import time
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Model path (placeholder literal for the locally downloaded model directory)
model_path = "模型路径"

# Load the tokenizer and the seq2seq model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Select the device: CUDA when available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Input and output directories (placeholder literals)
input_dir = "输入目录"
output_dir = "输出目录"

# Make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)

def split_text_by_sentence(text, max_length):
    """Split ``text`` into chunks of at most ``max_length`` characters.

    The text is cut after every Chinese full stop ('。') and consecutive
    sentences are packed into one chunk while they fit. The split is
    lossless: joining the returned chunks with ``""`` gives back ``text``
    exactly, so no full stop is added or duplicated.

    Args:
        text: Text to split.
        max_length: Maximum number of characters per chunk.

    Returns:
        A list of non-empty chunks; an empty list when ``text`` is empty.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    # Re-attach the delimiter to every sentence that actually had one. The
    # last piece had no trailing '。' and is kept only when it is non-empty.
    parts = text.split('。')
    sentences = [part + '。' for part in parts[:-1]]
    if parts[-1]:
        sentences.append(parts[-1])

    chunks = []
    current = ""
    for sentence in sentences:
        # Flush the running chunk when the next sentence would not fit.
        if current and len(current) + len(sentence) > max_length:
            chunks.append(current)
            current = ""

        if len(sentence) > max_length:
            # A single sentence longer than the limit is cut into fixed-size
            # slices; ``current`` is always empty here because of the flush.
            pieces = [
                sentence[start:start + max_length]
                for start in range(0, len(sentence), max_length)
            ]
            chunks.extend(pieces[:-1])
            current = pieces[-1]
        else:
            current += sentence

    if current:
        chunks.append(current)

    return chunks

def translate_text(text, tokenizer, model, device, max_length=512):
    """Translate ``text`` chunk by chunk and join the results with newlines.

    The text is split with ``split_text_by_sentence`` using ``max_length``
    as the chunk size; every chunk is encoded, translated with beam search
    and decoded.

    Args:
        text: Source text to translate.
        tokenizer: Object providing ``encode`` and ``decode``.
        model: Object providing ``generate``.
        device: Device the encoded ids are moved to before generation.
        max_length: Chunk size handed to ``split_text_by_sentence``; also
            used as the tokenizer truncation limit and as the generation
            length limit.

    Returns:
        The translated chunks joined with ``"\\n"``; ``""`` when there is
        nothing to translate.
    """
    translations = []
    with torch.no_grad():
        for chunk in split_text_by_sentence(text, max_length):
            # Skip chunks that contain nothing to translate.
            if not chunk.strip():
                continue
            # truncation=True silently drops tokens beyond max_length when a
            # chunk encodes to more tokens than that.
            inputs = tokenizer.encode(
                chunk, return_tensors="pt", truncation=True, max_length=max_length
            ).to(device)
            outputs = model.generate(
                inputs, max_length=max_length, num_beams=4, early_stopping=True
            )
            translations.append(
                tokenizer.decode(outputs[0], skip_special_tokens=True)
            )
    return "\n".join(translations)

def translate_filename(filename, tokenizer, model, device):
    """Build a translated, filesystem-safe ``.txt`` file name.

    The extension of ``filename`` is dropped, the remaining stem is
    translated with ``translate_text``, characters that are not allowed in
    file names are removed, and ``.txt`` is appended.

    Args:
        filename: Original file name (not a full path).
        tokenizer: Tokenizer forwarded to ``translate_text``.
        model: Model forwarded to ``translate_text``.
        device: Device forwarded to ``translate_text``.

    Returns:
        The translated file name ending in ``.txt``. When nothing usable is
        left after sanitising, the original stem is kept instead.
    """
    name_without_ext = os.path.splitext(filename)[0]
    translated_name = translate_text(name_without_ext, tokenizer, model, device)

    # Remove characters that are invalid in file names. Path separators and
    # double quotes are included so the result can never point into another
    # directory; control characters (e.g. the newlines translate_text joins
    # chunks with) become spaces.
    table = dict.fromkeys(map(ord, '?:<>|*"/\\'))
    table.update(dict.fromkeys(range(32), " "))
    cleaned = translated_name.translate(table)

    # Collapse whitespace runs and drop leading/trailing dots and spaces so
    # the stem does not end in "." right before the extension.
    stem = " ".join(cleaned.split()).strip(" .")
    if not stem:
        # Fall back to the source stem rather than producing a bare ".txt".
        stem = name_without_ext

    return f"{stem}.txt"
# Run statistics
start_time = time.time()
file_count = 0

# Walk the entries of the input directory in a stable order
for filename in sorted(os.listdir(input_dir)):
    file_path = os.path.join(input_dir, filename)

    # Only regular .txt files are translated (a directory named "x.txt"
    # would otherwise fail when opened).
    if not filename.endswith(".txt") or not os.path.isfile(file_path):
        continue

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        translated_text = translate_text(text, tokenizer, model, device)
        translated_filename = translate_filename(filename, tokenizer, model, device)

        # Different source names can translate to the same target name.
        # Because the source is deleted afterwards, overwriting an existing
        # output would lose data, so pick the first free "<stem>_<n>" name.
        stem, ext = os.path.splitext(translated_filename)
        output_file_path = os.path.join(output_dir, translated_filename)
        suffix = 1
        while os.path.exists(output_file_path):
            output_file_path = os.path.join(output_dir, f"{stem}_{suffix}{ext}")
            suffix += 1

        with open(output_file_path, "w", encoding="utf-8") as file:
            file.write(translated_text)

        # The source file is removed only after the translation was written
        os.remove(file_path)
        file_count += 1
    except Exception as e:
        # Top-level boundary: report the failure and go on with the next file
        print(f"处理文件 {filename} 时发生错误: {e}")

end_time = time.time()
elapsed_time = end_time - start_time
average_time_per_file = elapsed_time / file_count if file_count > 0 else 0
print(f"翻译完成，处理了 {file_count} 个文件。")
print(f"总耗时: {elapsed_time:.2f} 秒")
print(f"平均每篇文件翻译时间: {average_time_per_file:.2f} 秒")

测试速度：

翻译完成，处理了 5 个文件。
总耗时: 39.35 秒
平均每篇文件翻译时间: 7.87 秒

608

老猫

Helsinki-NLP/opus-mt-zh-en翻译小模型

发表回复取消回复

最新发的

几大分类

文章标签

Helsinki-NLP/opus-mt-zh-en翻译小模型

发表回复 取消回复

最新发的

几大分类

文章标签

发表回复取消回复