生成AI(Gemini)を使用して、メールを要約する

やりたいこと.

  • ゆるい規則性をもった文章を生成AI(GEMINI)で処理

  • サマリーをHTMLに記載.

    • jinja2を使用


コード

gemini__news_messages.py

import os, sys, re, glob, json5, tqdm
import googletrans
import extract_msg
import datetime
import google.generativeai as genai

# -- make "GEMINI_API_KEY" os environment variable 
#  $ export GEMINI_API_KEY="api key" 

# ========================================================= #
# ===  gemini__news_messages                            === #
# ========================================================= #
# -- tags requested from Gemini, in the order the prompt lists them -- #
_SECTION_TAGS = ( "opinion", "content", "lang_ja", "summary", "nuclide", "company" )


# ========================================================= #
# ===  _extract_tagged_sections                         === #
# ========================================================= #
def _extract_tagged_sections( texts ):
    """Split a Gemini reply into the text that follows each ``[tag]``.

    Each section is the text between its own tag and the next tag in
    ``_SECTION_TAGS``; the last section runs to the end of the reply.
    A section whose tag pair is not found is returned as "".

    Args:
        texts: raw reply text.

    Returns:
        dict mapping every tag name in ``_SECTION_TAGS`` to its stripped text.
    """
    sections  = {}
    next_tags = _SECTION_TAGS[1:] + ( None, )
    for tag, next_tag in zip( _SECTION_TAGS, next_tags ):
        # raw strings: "\[" in a normal string literal is an invalid escape sequence.
        if next_tag is None:
            pattern = r"\[{0}\](.*)".format( tag )
        else:
            pattern = r"\[{0}\](.*)\[{1}\]".format( tag, next_tag )
        match           = re.search( pattern, texts, flags=re.DOTALL )
        sections[ tag ] = match.group(1).strip() if match else ""
    return sections


# ========================================================= #
# ===  gemini__news_messages                            === #
# ========================================================= #
def gemini__news_messages():
    """Summarize every msg/*.msg mail with Gemini and save the result as JSON.

    For each mail the subject is cleaned of RE:/FW:/FWD: and "[!]" prefixes and
    translated to Japanese, the quoted From/Sent/To/Subject header block and
    everything after it is dropped from the body, and Gemini is asked to return
    six tagged sections.  Records are sorted by date, renumbered, and written to
    dat/database.json.

    Requires the GEMINI_API_KEY environment variable.

    Returns:
        dict mapping the zero-padded sequence number ("000001", ...) to the
        record of one mail.
    """
    path             = "msg/*.msg"
    databaseFile     = "dat/database.json"

    sentMail_pattern = re.compile(
        r"^From:.*?\n"
        r"^Sent:.*?\n"
        r"^To:.*?\n"
        r"^Subject:.*?$",
        re.MULTILINE
    )

    # ------------------------------------------------- #
    # --- [1] search filename                       --- #
    # ------------------------------------------------- #
    filenames = glob.glob( path )
    tr        = googletrans.Translator()
    stack     = {}
    # -- create the output directory up front, so a missing "dat/" cannot  -- #
    # -- make the save step fail after all the API calls have been made.   -- #
    os.makedirs( os.path.dirname( databaseFile ), exist_ok=True )

    # ------------------------------------------------- #
    # --- [2] gemini configuration                  --- #
    # ------------------------------------------------- #
    # export GEMINI_API_KEY="api key"
    my_api_key = os.environ.get( "GEMINI_API_KEY" )
    genai.configure( api_key=my_api_key )
    model      = genai.GenerativeModel()

    # ------------------------------------------------- #
    # --- [3] summarize content                     --- #
    # ------------------------------------------------- #
    for ik, filename in enumerate( tqdm.tqdm( filenames, desc="Processing files" ) ):
        IDno       = "{:06}".format(ik+1)
        msg        = extract_msg.Message( filename )
        try:
            date    = ( msg.date ).strftime( '%Y/%m/%d' )
            # -- subject / body may be missing in a .msg file -- #
            subject = msg.subject or ""
            body    = msg.body    or ""
        finally:
            msg.close()    # release the file handle held for the .msg file
        subject_en = re.sub(r'^(?:\s*(RE|FW|FWD):\s*)+', '', subject, flags=re.IGNORECASE)
        subject_en = re.sub(r'^\s*\[\!\]\s*'           , '', subject_en )
        subject_ja = ( tr.translate( subject_en, dest="ja", src="en" ) ).text
        # -- keep only the text above the quoted mail header block -- #
        match      = sentMail_pattern.search( body )
        if match:
            body   = ( body[:match.start()] ).strip()
        body       = re.sub( r'\s+', ' ', body ).strip()
        # -- the prompt lists six tags, so it now asks for 6 items (was "5") -- #
        prompt     = \
            f"""
            次の転送メール本文について、以下の6項目を抽出し、それぞれに対応するタグを付けてください。
            [opinion]: 転送者による私見(本文の冒頭にあることが多い)。挨拶や署名を除いて要点のみを残してください。
            [content]: 私見以外の引用・記事の本文を全文.
            [lang_ja]: 引用・記事部分の日本語訳文.
            [summary]: 記事の要点を日本語3行で要約.各行は '* ' で始め、末尾は改行.
            [nuclide]: 記事中に登場する放射性核種を元素記号と質量数の形式(e.g.:Ra-226, Ac-225)で列挙.質量数がなければ元素記号のみ.(カンマ区切り)
            [company]: 記事中に登場する会社名を列挙.ただし、送信者:「日立」「日立ハイテク」は除外.(カンマ区切り)
            出力は、上記の各タグの後にそれぞれ記載してください.
            メール本文:\n
            {body}
            """
        response      = model.generate_content( prompt )
        sections      = _extract_tagged_sections( response.text )
        info          = { "id":IDno, "filename":filename, "date":date,
                          "subject_en":subject_en, "subject_ja":subject_ja,
                          "opinion":sections["opinion"],
                          "content_en":sections["content"], "content_ja":sections["lang_ja"],
                          "summary":sections["summary"],
                          "nuclide":sections["nuclide"], "company":sections["company"] }
        stack[ IDno ] = info

    # ------------------------------------------------- #
    # --- [4] sort and renumber                     --- #
    # ------------------------------------------------- #
    # NOTE(review): "id" keeps the pre-sort number while "No." and the dict key
    #               carry the new one -- confirm that this is intended.
    sorted_items = sorted( stack.items(), key=lambda x: datetime.datetime.strptime( x[1]['date'], '%Y/%m/%d') )
    stack_       = {}
    for ik, ( _, item ) in enumerate( sorted_items, 1 ):
        IDno            = "{:06}".format( ik )
        item["No."]     = IDno
        stack_[ IDno  ] = item
    stack = stack_

    # ------------------------------------------------- #
    # --- [5] save as json file                     --- #
    # ------------------------------------------------- #
    # -- explicit UTF-8: the file holds non-ASCII text (ensure_ascii=False) -- #
    # -- and is read back as UTF-8 by generate__html_pages.py.              -- #
    with open( databaseFile, "w", encoding="utf-8" ) as f:
        json5.dump( stack, f, ensure_ascii=False )
    return( stack )
        
        
# ========================================================= #
# ===   Execution of Program                            === #
# ========================================================= #

# -- run the summarization only when executed as a script -- #
if __name__ == "__main__":
    gemini__news_messages()

generate__html_pages.py

import os, shutil
import json5
import jinja2

# ========================================================= #
# ===  generate__html_pages.py                          === #
# ========================================================= #

def generate__html_pages():
    """Render dat/database.json into static HTML pages under html/.

    Writes one detail page per record (html/<key>.html) from
    templates/details--template.html, and html/index.html from
    templates/index--template.html.  templates/style.css is copied next to
    the generated pages.

    All record text is HTML-escaped here before rendering, because it comes
    from mail bodies and model output and the Jinja environment below does
    not autoescape.  The <br> and &#9670; markup for the index table is added
    after escaping, so it still reaches the page as markup.
    NOTE(review): if autoescape is later enabled on the environment, remove
    the escaping here to avoid double-escaping.
    """
    import html    # stdlib; imported locally so this block stays self-contained

    # ------------------------------------------------- #
    # --- [1]  load data                            --- #
    # ------------------------------------------------- #
    with open("dat/database.json", "r", encoding="utf-8") as f:
        data = json5.load(f)

    # ------------------------------------------------- #
    # --- [2]  directory settings                   --- #
    # ------------------------------------------------- #
    #  --  output directory  -- #
    os.makedirs("html/", exist_ok=True)
    shutil.copy("templates/style.css", "html/style.css")

    #  -- Jinja environment -- #
    env              = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"))
    details_template = env.get_template("details--template.html")
    index_template   = env.get_template("index--template.html")

    # ------------------------------------------------- #
    # --- [3] detail pages                          --- #
    # ------------------------------------------------- #
    for mail_id, mail in data.items():
        filename  = f"html/{mail_id}.html"
        # -- escape every string field; other value types pass through -- #
        safe_mail = { key: ( html.escape( value ) if isinstance( value, str ) else value )
                      for key, value in mail.items() }
        rendered  = details_template.render(mail=safe_mail)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(rendered)
        print( f"output :: {filename}" )

    # ------------------------------------------------- #
    # --- [4] index page                            --- #
    # ------------------------------------------------- #
    #  -- [4-1] prepare rows -- #
    print( "\n ------------------------------------- " )
    mails = []
    for mail_id, mail in data.items():
        subject_en = html.escape( mail.get( "subject_en", "" ) )
        subject_ja = html.escape( mail.get( "subject_ja", "" ) )
        subject    = "{0}<br>({1})".format( subject_en, subject_ja )
        # -- escape first, then insert the presentation markup -- #
        company    = html.escape( mail.get("company", "") ).replace( ",", ",<br>"     )
        summary    = html.escape( mail.get("summary", "") ).replace( "* ", "&#9670; " )
        nuclide    = html.escape( mail.get("nuclide", "") ).replace( ",", "<br>"      )
        mails.append( {
            "id": mail_id,
            "date": html.escape( mail.get("date", "") ),
            "subject": subject,
            "summary": summary,
            "nuclide": nuclide,
            "company": company,
        } )
    #  -- [4-2] write HTML -- #
    indexFile = "html/index.html"
    with open( indexFile, "w", encoding="utf-8") as f:
        f.write( index_template.render( mails=mails ) )
    print( f"output :: {indexFile}" )
    print()


# ========================================================= #
# ===   Execution of Program                            === #
# ========================================================= #

# -- build the HTML pages only when executed as a script -- #
if __name__ == "__main__":
    generate__html_pages()

HTML テンプレート

style.css

/* ---------------------------------- */
/* ---        Detail page         --- */
/* ---------------------------------- */

/* Page-wide typography; content is centred and capped at 1250px. */
body {
    font-family: -apple-system, BlinkMacSystemFont, "Helvetica Neue", "Noto Sans JP", sans-serif;
    font-size: 13.5px;
    line-height: 1.6;
    background-color: #fff;
    color: #222;
    margin: 2em auto;
    max-width: 1250px;
    padding: 0 1em;
}

/* Page title with an underline rule. */
h1 {
    font-size: 1.2em;
    border-bottom: 2px solid #666;
    padding-bottom: 0.3em;
    margin-bottom: 1.5em;
}

/* Two-column key/value table of the detail page. */
.table-outer {
    width: 100%;
    border-collapse: collapse;
    margin-bottom: 2em;
    table-layout: fixed;
}

.table-outer th, .table-outer td {
    border: 1px solid #ccc;
    padding: 0.6em 0.8em;
    vertical-align: top;
}

/* Label column: fixed 120px wide, never wraps. */
.table-outer th {
    background-color: #E0FFFF;
    width: 120px;
    font-weight: bold;
    text-align: center;
    white-space: nowrap;
}

/* Value column: keeps line breaks in the text and wraps long words. */
.table-outer td {
    background-color: #fff;
    white-space: pre-wrap;
    word-wrap: break-word;
}

/* <pre> blocks keep their line breaks but wrap instead of overflowing. */
pre {
    margin: 0;
    font-family: "Helvetica Neue", monospace;
    font-size: 0.95em;
    white-space: pre-wrap;
}


/* ---------------------------------- */
/* ---        Index page          --- */
/* ---------------------------------- */

/* Index table: fixed column widths, with gaps between cells
   (border-spacing only takes effect with border-collapse: separate). */
table.index-table {
    width: 100%;
    table-layout: fixed;
    border-collapse: separate;
    border-spacing: 6px 4px;  /* 6px between columns, 4px between rows */
    margin-bottom: 2em;
}

/* Index cells: no box border, only a thin rule under each cell. */
.index-table th,
.index-table td {
    border: none;                  /* remove all cell borders ... */
    padding: 1.0em 1.0em;
    vertical-align: middle;        /* "center" is not a valid vertical-align value */
    font-size: 13.5px;
    border-bottom: 1px solid #ccc; /* ... then add back the bottom rule only */
}

/* Header cells of the index table: tinted background, never wrap. */
.index-table th {
    background-color: #E0FFFF;
    font-weight: bold;
    white-space: nowrap;
}

/* <pre> inside index cells: keep line breaks but wrap long lines. */
.index-table td pre {
    margin: 0;
    white-space: pre-wrap;
    font-family: "Helvetica Neue", monospace;
    font-size: 0.95em;
}

details--template.html

<!DOCTYPE html>
<html lang="ja">
<head>
  {# Detail page for a single mail.
     Expects a `mail` object providing: id, subject_en, subject_ja, date,
     summary, nuclide, company, opinion, content_ja, content_en. #}
  <meta charset="UTF-8">
  <title>{{ mail.subject_ja }}</title>
  <link rel="stylesheet" href="style.css">
</head>
<body>
  {# Link back to the list page (index.html in the same directory). #}
  <a href="index.html">一覧へ</a>
  <h1>{{ mail.subject_ja }}</h1>
  {# One row per field; multi-line fields are wrapped in <pre>.
     NOTE(review): values are emitted with plain {{ }}; whether they are
     HTML-escaped depends on the autoescape setting of the Jinja environment. #}
  <table class="table-outer">
    <tr><th>ID</th><td>{{ mail.id }}</td></tr>
    <tr><th>Title</th><td>{{ mail.subject_en }}</td></tr>
    <tr><th>タイトル</th><td>{{ mail.subject_ja }}</td></tr>
    <tr><th>日付</th><td>{{ mail.date }}</td></tr>
    <tr><th>要約</th><td><pre>{{ mail.summary }}</pre></td></tr>
    <tr><th>核種</th><td>{{ mail.nuclide }}</td></tr>
    <tr><th>企業</th><td>{{ mail.company }}</td></tr>
    <tr><th>意見</th><td><pre>{{ mail.opinion }}</pre></td></tr>
    <tr><th>本文</th><td><pre>{{ mail.content_ja }}</pre></td></tr>
    <tr><th>Content</th><td><pre>{{ mail.content_en }}</pre></td></tr>
</table>
</body>
</html>

index--template.html

<!DOCTYPE html>
<html lang="ja">
  <head>
    {# List page.  Expects `mails`: an iterable of items providing
       id, date, subject, summary, nuclide, company. #}
    <meta charset="UTF-8">
    <title>ニュース一覧</title>
    <link rel="stylesheet" href="style.css">
  </head>
  <body>
    <h1>ニュース一覧</h1>
    <table class="index-table">
      {# Header row: date / subject / summary / nuclide / company. #}
      <tr>
        <th style="width:  8%;">日付</th>
        <th style="width: 30%;">題名</th>
        <th style="width: 40%;">要約</th>
        <th style="width:  7%;">核種</th>
        <th style="width: 15%;">企業</th>
      </tr>
      {# One row per mail; the subject links to "<id>.html" in the same directory.
         NOTE(review): values are emitted with plain {{ }}; whether markup inside
         them is escaped depends on the autoescape setting of the Jinja environment. #}
      {% for item in mails %}
      <tr>
        <td style="width:  8%;">{{ item.date }}</td>
        <td style="width: 30%;"><a href="{{ item.id }}.html">{{ item.subject }}</a></td>
        <td style="width: 40%;"><pre>{{ item.summary }}</pre></td>
        <td style="width:  7%;">{{ item.nuclide }}</td>
        <td style="width: 15%;">{{ item.company }}</td>
      </tr>
      {% endfor %}
    </table>
  </body>
</html>