生成AI(Gemini)を使用したメールの要約¶
やりたいこと.¶
ゆるい規則性をもった文章を生成AI(GEMINI)で処理
サマリーをHTMLに記載.
jinja2を使用
コード¶
gemini__news_messages.py¶
import os, sys, re, glob, json5, tqdm
import googletrans
import extract_msg
import datetime
import google.generativeai as genai
# -- make "GEMINI_API_KEY" os environment variable
# $ export GEMINI_API_KEY="api key"
# ========================================================= #
# === gemini__news_messages === #
# ========================================================= #
def gemini__news_messages():
    """Summarize forwarded news mails (``msg/*.msg``) with Gemini and save them as JSON.

    For every ``.msg`` file the subject is cleaned of reply/forward prefixes and
    translated to Japanese, the quoted "From/Sent/To/Subject" header block and
    everything after it is cut from the body, and the remainder is sent to
    Gemini with a prompt asking for six tagged sections.  The tagged sections
    are pulled out of the reply with regular expressions.

    The records are then sorted by mail date, renumbered, written to
    ``dat/database.json`` (UTF-8) and returned.

    Returns:
        dict: zero-padded sequence number (``"000001"``, ...) -> record dict
        with the keys ``id``, ``filename``, ``date``, ``subject_en``,
        ``subject_ja``, ``opinion``, ``content_en``, ``content_ja``,
        ``summary``, ``nuclide``, ``company`` and ``No.``.
    """
    path = "msg/*.msg"
    databaseFile = "dat/database.json"
    # Header block that Outlook inserts in front of a quoted/forwarded mail.
    sentMail_pattern = re.compile(
        r"^From:.*?\n"
        r"^Sent:.*?\n"
        r"^To:.*?\n"
        r"^Subject:.*?$",
        re.MULTILINE
    )
    # Output key -> pattern capturing the text between one tag and the next.
    # Raw strings avoid the invalid "\[" escape warnings, and compiling here
    # keeps the loop body free of invariant work.
    section_patterns = {
        "opinion"   : re.compile( r"\[opinion\](.*)\[content\]", re.DOTALL ),
        "content_en": re.compile( r"\[content\](.*)\[lang_ja\]", re.DOTALL ),
        "content_ja": re.compile( r"\[lang_ja\](.*)\[summary\]", re.DOTALL ),
        "summary"   : re.compile( r"\[summary\](.*)\[nuclide\]", re.DOTALL ),
        "nuclide"   : re.compile( r"\[nuclide\](.*)\[company\]", re.DOTALL ),
        "company"   : re.compile( r"\[company\](.*)", re.DOTALL ),
    }

    # ------------------------------------------------- #
    # --- [1] search filename                       --- #
    # ------------------------------------------------- #
    # Sorted so that the pre-sort ids and the order of same-day mails do not
    # depend on the order in which the file system happens to list the files.
    filenames = sorted( glob.glob( path ) )
    tr = googletrans.Translator()
    stack = {}

    # ------------------------------------------------- #
    # --- [2] gemini configuration                  --- #
    # ------------------------------------------------- #
    # The API key is read from the environment:
    #   $ export GEMINI_API_KEY="api key"
    my_api_key = os.environ.get( "GEMINI_API_KEY" )
    genai.configure( api_key=my_api_key )
    model = genai.GenerativeModel()

    # ------------------------------------------------- #
    # --- [3] summarize content                     --- #
    # ------------------------------------------------- #
    for ik, filename in enumerate( tqdm.tqdm( filenames, desc="Processing files" ) ):
        IDno = "{:06}".format( ik+1 )

        # Read everything needed from the .msg file, then release the handle.
        msg = extract_msg.Message( filename )
        try:
            date = ( msg.date ).strftime( '%Y/%m/%d' )
            subject = msg.subject or ""      # a mail may have no subject
            body = msg.body or ""            # ... or no plain-text body
        finally:
            msg.close()

        # Drop any leading run of "RE:" / "FW:" / "FWD:" and a leading "[!]" marker.
        subject_en = re.sub( r'^(?:\s*(RE|FW|FWD):\s*)+', '', subject, flags=re.IGNORECASE )
        subject_en = re.sub( r'^\s*\[\!\]\s*', '', subject_en )
        if subject_en:
            subject_ja = ( tr.translate( subject_en, dest="ja", src="en" ) ).text
        else:
            subject_ja = ""                  # nothing to translate

        # Keep only the text in front of the first quoted mail header,
        # then collapse every whitespace run into a single blank.
        match = sentMail_pattern.search( body )
        if match:
            body = ( body[:match.start()] ).strip()
        body = re.sub( r'\s+', ' ', body ).strip()

        # The prompt lists six tags, so the count in the first sentence is six.
        prompt = f"""
次の転送メール本文について、以下の6項目を抽出し、それぞれに対応するタグを付けてください。
[opinion]: 転送者による私見(本文の冒頭にあることが多い)。挨拶や署名を除いて要点のみを残してください。
[content]: 私見以外の引用・記事の本文を全文.
[lang_ja]: 引用・記事部分の日本語訳文.
[summary]: 記事の要点を日本語3行で要約.各行は '* ' で始め、末尾は改行.
[nuclide]: 記事中に登場する放射性核種を元素記号と質量数の形式(e.g.:Ra-226, Ac-225)で列挙.質量数がなければ元素記号のみ.(カンマ区切り)
[company]: 記事中に登場する会社名を列挙.ただし、送信者:「日立」「日立ハイテク」は除外.(カンマ区切り)
出力は、上記の各タグの後にそれぞれ記載してください.
メール本文:\n
{body}
"""
        response = model.generate_content( prompt )
        texts = response.text

        # A section whose tag pair is missing from the reply becomes "".
        sections = {}
        for key, pattern in section_patterns.items():
            found = pattern.search( texts )
            sections[key] = found.group(1).strip() if found else ""

        info = { "id":IDno, "filename":filename, "date":date,
                 "subject_en":subject_en, "subject_ja":subject_ja,
                 "opinion":sections["opinion"],
                 "content_en":sections["content_en"],
                 "content_ja":sections["content_ja"],
                 "summary":sections["summary"],
                 "nuclide":sections["nuclide"],
                 "company":sections["company"] }
        stack[ IDno ] = info

    # ------------------------------------------------- #
    # --- [4] sort and renumber                     --- #
    # ------------------------------------------------- #
    # NOTE(review): only the dict key and "No." receive the new number; the
    # "id" field keeps the pre-sort number -- confirm that this is intended.
    sorted_items = sorted( stack.items(), key=lambda x: datetime.datetime.strptime( x[1]['date'], '%Y/%m/%d') )
    stack_ = {}
    for ik, ( _, item ) in enumerate( sorted_items, 1 ):
        IDno = "{:06}".format( ik )
        item["No."] = IDno
        stack_[ IDno ] = item
    stack = stack_

    # ------------------------------------------------- #
    # --- [5] save as json file                     --- #
    # ------------------------------------------------- #
    # ensure_ascii=False writes Japanese text verbatim, so the file encoding
    # must be fixed to UTF-8 instead of the platform default.
    os.makedirs( os.path.dirname( databaseFile ) or ".", exist_ok=True )
    with open( databaseFile, "w", encoding="utf-8" ) as f:
        json5.dump( stack, f, ensure_ascii=False )
    return( stack )
# ========================================================= #
# === Execution of Program === #
# ========================================================= #
# Run the summarizer only when this file is executed as a script.
if __name__ == "__main__":
    gemini__news_messages()
generate__html_pages.py¶
import os, shutil
import json5
import jinja2
# ========================================================= #
# === generate__html_pages.py === #
# ========================================================= #
def generate__html_pages():
    """Render ``dat/database.json`` into static HTML pages under ``html/``.

    Writes one details page per record (``html/<key>.html``), an overview page
    (``html/index.html``) and copies ``templates/style.css`` next to them.

    The record text originates from e-mails and from model output, so it is
    HTML-escaped before it reaches the page:

    * the details template is rendered by an autoescaping environment;
    * the index template prints ``subject`` / ``nuclide`` / ``company`` with
      ``<br>`` tags built here, so those strings are escaped first and the
      ``<br>`` tags are added afterwards.
    """
    from html import escape     # stdlib; imported here to keep the block self-contained

    # ------------------------------------------------- #
    # --- [1] load data                             --- #
    # ------------------------------------------------- #
    with open("dat/database.json", "r", encoding="utf-8") as f:
        data = json5.load(f)

    # ------------------------------------------------- #
    # --- [2] directories and templates             --- #
    # ------------------------------------------------- #
    # -- output directory -- #
    os.makedirs("html/", exist_ok=True)
    shutil.copy("templates/style.css", "html/style.css")
    # -- Jinja environments -- #
    loader = jinja2.FileSystemLoader("templates")
    details_env = jinja2.Environment(loader=loader, autoescape=True)
    index_env = jinja2.Environment(loader=loader)   # receives pre-escaped markup
    details_template = details_env.get_template("details--template.html")
    index_template = index_env.get_template("index--template.html")

    # ------------------------------------------------- #
    # --- [3] details pages                         --- #
    # ------------------------------------------------- #
    for mail_id, mail in data.items():
        filename = f"html/{mail_id}.html"
        rendered = details_template.render(mail=mail)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(rendered)
        print( f"output :: {filename}" )

    # ------------------------------------------------- #
    # --- [4] index page                            --- #
    # ------------------------------------------------- #
    # -- [4-1] prepare rows -- #
    print( "\n ------------------------------------- " )
    mails = []
    for mail_id, mail in data.items():
        # Escape first, then add the markup that is meant to be rendered.
        subject = "{0}<br>({1})".format( escape( mail.get( "subject_en","" ) ),
                                         escape( mail.get( "subject_ja","" ) ) )
        company = escape( mail.get("company", "") ).replace( ",", ",<br>" )
        summary = escape( mail.get("summary", "") ).replace( "* ", "◆ " )
        nuclide = escape( mail.get("nuclide", "") ).replace( ",", "<br>" )
        mails.append( {
            "id": mail_id,
            "date": mail.get("date", ""),
            "subject": subject,
            "summary": summary,
            "nuclide": nuclide,
            "company": company,
        } )
    # -- [4-2] write HTML -- #
    indexFile = "html/index.html"
    with open( indexFile, "w", encoding="utf-8") as f:
        f.write( index_template.render( mails=mails ) )
    print( f"output :: {indexFile}" )
    print()
# ========================================================= #
# === Execution of Program === #
# ========================================================= #
# Build the HTML pages only when this file is executed as a script.
if __name__ == "__main__":
    generate__html_pages()
HTML テンプレート¶
style.css¶
/* ---------------------------------- */
/* --- Details page               --- */
/* ---------------------------------- */
body {
  font-family: -apple-system, BlinkMacSystemFont, "Helvetica Neue", "Noto Sans JP", sans-serif;
  font-size: 13.5px;
  line-height: 1.6;
  background-color: #fff;
  color: #222;
  margin: 2em auto;
  max-width: 1250px;
  padding: 0 1em;
}
h1 {
  font-size: 1.2em;
  border-bottom: 2px solid #666;
  padding-bottom: 0.3em;
  margin-bottom: 1.5em;
}
/* Two-column key/value table: <th> holds the label, <td> the value. */
.table-outer {
  width: 100%;
  border-collapse: collapse;
  margin-bottom: 2em;
  table-layout: fixed;
}
.table-outer th, .table-outer td {
  border: 1px solid #ccc;
  padding: 0.6em 0.8em;
  vertical-align: top;
}
.table-outer th {
  background-color: #E0FFFF;
  width: 120px;
  font-weight: bold;
  text-align: center;
  white-space: nowrap;
}
.table-outer td {
  background-color: #fff;
  white-space: pre-wrap;
  word-wrap: break-word;      /* legacy alias, kept for old browsers */
  overflow-wrap: break-word;  /* standard property name */
}
/* <pre> keeps the line breaks of the text but still wraps long lines. */
pre {
  margin: 0;
  font-family: "Helvetica Neue", monospace;
  font-size: 0.95em;
  white-space: pre-wrap;
}
/* ---------------------------------- */
/* --- Index page                 --- */
/* ---------------------------------- */
table.index-table {
  width: 100%;
  table-layout: fixed;
  border-collapse: separate;
  border-spacing: 6px 4px;    /* 6px between columns, 4px between rows */
  margin-bottom: 2em;
}
.index-table th,
.index-table td {
  border: none;               /* no cell borders; only the bottom rule below */
  padding: 1.0em 1.0em;
  vertical-align: middle;     /* "center" is not a valid vertical-align value */
  font-size: 13.5px;
  border-bottom: 1px solid #ccc;
}
.index-table th {
  background-color: #E0FFFF;
  font-weight: bold;
  white-space: nowrap;
}
.index-table td pre {
  margin: 0;
  white-space: pre-wrap;
  font-family: "Helvetica Neue", monospace;
  font-size: 0.95em;
}
details--template.html¶
{#
  Details page for a single mail record, passed in as `mail`.
  Every value goes through the `e` (escape) filter so that markup characters
  in the mail text are displayed literally instead of being interpreted as
  HTML, whether or not the environment autoescapes.
-#}
<!DOCTYPE html>
<html lang="ja">
<head>
<meta charset="UTF-8">
<title>{{ mail.subject_ja | e }}</title>
<link rel="stylesheet" href="style.css">
</head>
<body>
<a href="index.html">一覧へ</a>
<h1>{{ mail.subject_ja | e }}</h1>
<table class="table-outer">
<tr><th>ID</th><td>{{ mail.id | e }}</td></tr>
<tr><th>Title</th><td>{{ mail.subject_en | e }}</td></tr>
<tr><th>タイトル</th><td>{{ mail.subject_ja | e }}</td></tr>
<tr><th>日付</th><td>{{ mail.date | e }}</td></tr>
<tr><th>要約</th><td><pre>{{ mail.summary | e }}</pre></td></tr>
<tr><th>核種</th><td>{{ mail.nuclide | e }}</td></tr>
<tr><th>企業</th><td>{{ mail.company | e }}</td></tr>
<tr><th>意見</th><td><pre>{{ mail.opinion | e }}</pre></td></tr>
<tr><th>本文</th><td><pre>{{ mail.content_ja | e }}</pre></td></tr>
<tr><th>Content</th><td><pre>{{ mail.content_en | e }}</pre></td></tr>
</table>
</body>
</html>
index--template.html¶
{#
  Overview page: one table row per entry of `mails`.
  Each entry provides id, date, subject, summary, nuclide and company.
  subject, nuclide and company arrive with <br> tags already inserted by the
  generator script, so they are printed as-is and must not be escaped here.
-#}
<!DOCTYPE html>
<html lang="ja">
<head>
<meta charset="UTF-8">
<title>ニュース一覧</title>
<link rel="stylesheet" href="style.css">
</head>
<body>
<h1>ニュース一覧</h1>
<table class="index-table">
<thead>
<tr>
<th scope="col" style="width: 8%;">日付</th>
<th scope="col" style="width: 30%;">題名</th>
<th scope="col" style="width: 40%;">要約</th>
<th scope="col" style="width: 7%;">核種</th>
<th scope="col" style="width: 15%;">企業</th>
</tr>
</thead>
<tbody>
{% for item in mails %}
<tr>
<td style="width: 8%;">{{ item.date }}</td>
<td style="width: 30%;"><a href="{{ item.id }}.html">{{ item.subject }}</a></td>
<td style="width: 40%;"><pre>{{ item.summary }}</pre></td>
<td style="width: 7%;">{{ item.nuclide }}</td>
<td style="width: 15%;">{{ item.company }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</body>
</html>