Linux 45-56-67-123 5.14.0-503.16.1.el9_5.x86_64 #1 SMP PREEMPT_DYNAMIC Fri Dec 13 01:47:05 EST 2024 x86_64
Apache/2.4.62 (AlmaLinux) OpenSSL/3.2.2
45.56.67.123 | 52.15.253.72
Can't Read [ /etc/named.conf ]
8.3.15
apache
www.github.com/MadExploits
Path : /var/www/afra/py.afaa.website/ [ HOME SHELL ]
Name                             Size       Permission
.vscode                          [ DIR ]    drwxr-xr-x
__pycache__                      [ DIR ]    drwxr-xr-x
explore                          [ DIR ]    drwxr-xr-x
mara1_model                      [ DIR ]    drwxr-xr-x
nltk_data                        [ DIR ]    drwxr-xr-x
pythainlp_data                   [ DIR ]    drwxr-xr-x
runs                             [ DIR ]    drwxr-xr-x
venv                             [ DIR ]    drwxr-xr-x
yolo8                            [ DIR ]    drwxr-xr-x
. htaccess                       417 B      -rwxr-xr-x
.htaccess                        197 B      -rwxr-xr-x
app.log                          66 B       -rwxr-xr-x
app.py                           16.16 KB   -rwxr-xr-x
app.wsgi                         568 B      -rwxr-xr-x
blog_ttt_website.py              5.07 KB    -rwxr-xr-x
config.py                        485 B      -rwxr-xr-x
content_content_image_ram.py     11.07 KB   -rw-r--r--
content_main_ram.py              15.99 KB   -rw-r--r--
detect.py                        8.16 KB    -rwxr-xr-x
explore.py                       11.4 KB    -rw-r--r--
json_logfile.json                18 B       -rwxr-xr-x
keywords.json                    203 B      -rw-r--r--
locations.json                   299 B      -rw-r--r--
read_virus_files_php.py          2.6 KB     -rwxr-xr-x
table.csv                        56.2 KB    -rw-r--r--
travel.py                        6.89 KB    -rw-r--r--
travel_action.py                 10.72 KB   -rw-r--r--
travel_mara1.py                  12.6 KB    -rw-r--r--
travel_nltk_base.py              5.21 KB    -rw-r--r--
travel_pythainlp.py              9.37 KB    -rw-r--r--
udo systemctl daemon-reload      1.07 KB    -rwxr-xr-x
udo systemctl restart apache2    1.25 KB    -rwxr-xr-x
your_flask_service.log           1.07 KB    -rwxr-xr-x
Code Editor : travel_pythainlp.py
# /var/www/afra/py.afaa.website/travel_pythainlp.py
import re
import os
import textwrap
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import json
from collections import Counter
from mara1_model.combine_related_words import combine_related_words  # Import the new function
from mara1_model.get_stopwords import get_stopwords, PRESERVE_WORDS  # Import the function and PRESERVE_WORDS

try:
    import pythainlp
    from pythainlp.tokenize import sent_tokenize as thai_sent_tokenize, word_tokenize as thai_word_tokenize
    from pythainlp.corpus import thai_words
    from pythainlp.util import normalize
    from pythainlp.summarize import summarize
    from pythainlp.corpus.volubilis import thai_volubilis_words  # Import Volubilis function
    PYTHAINLP_AVAILABLE = True
    print("PyThaiNLP is available and will be used.")
except ImportError:
    print("PyThaiNLP is not installed. Falling back to basic processing.")
    PYTHAINLP_AVAILABLE = False
    thai_sent_tokenize = sent_tokenize
    thai_word_tokenize = word_tokenize
    normalize = lambda x: x
    thai_words = lambda: set()
    summarize = lambda x, n: [x]  # Fallback if PyThaiNLP is missing
    thai_volubilis_words = lambda: frozenset()  # Fallback for Volubilis

# Define Volubilis filename
_VOLUBILIS_FILENAME = "volubilis_words_th.txt"


def download_nltk_resources(download_dir=None):
    """Downloads necessary NLTK resources."""
    try:
        for resource in ['punkt_tab', 'stopwords']:
            try:
                nltk.data.find(f'tokenizers/{resource}', paths=[download_dir] if download_dir else None)
            except LookupError:
                print(f"Downloading NLTK resource: {resource} to {download_dir}...")
                nltk.download(resource, download_dir=download_dir, quiet=False)
    except Exception as e:
        print(f"Error downloading NLTK resources: {str(e)}")
        raise


if 'NLTK_DATA' not in os.environ:
    nltk_data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'nltk_data')
    os.environ['NLTK_DATA'] = nltk_data_path
    if not os.path.exists(nltk_data_path):
        os.makedirs(nltk_data_path)
    nltk.data.path.append(nltk_data_path)

download_nltk_resources(os.environ.get('NLTK_DATA'))


def clean_and_normalize_text(text):
    if text is None:
        return ""
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<s>.*?</s>', '', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<script.*?>.*?</script>', '', text, flags=re.DOTALL)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    cleaned_text = text.strip()
    return normalize(cleaned_text) if PYTHAINLP_AVAILABLE else cleaned_text


def tokenize_sentences(text):
    if PYTHAINLP_AVAILABLE:
        try:
            return thai_sent_tokenize(text, engine="crfcut")
        except Exception as e:
            print(f"PyThaiNLP sentence tokenization error: {str(e)}")
    return [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]


def tokenize_words(text):
    if PYTHAINLP_AVAILABLE:
        try:
            tokens = thai_word_tokenize(text, engine="newmm", keep_whitespace=False)
            return [token for token in tokens if len(token.strip()) >= 2]
        except Exception as e:
            print(f"PyThaiNLP word tokenization error: {str(e)}")
    return text.split()


def analyze_sentiment(text):
    return "neutral"


def extract_locations(text):
    locations_file = '/var/www/afra/py.afaa.website/locations.json'
    try:
        with open(locations_file, 'r', encoding='utf-8') as f:
            predefined_locations = json.load(f)
    except FileNotFoundError:
        print(f"Warning: {locations_file} not found, using default locations.")
        predefined_locations = []
    words = tokenize_words(text)
    place_names = [word for word in words if word in predefined_locations]
    return list(set(place_names))


def format_paragraph(paragraph, keywords):
    sentences = tokenize_sentences(paragraph)
    formatted = ""
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        if formatted:
            formatted += " "
        formatted += sentence
    return formatted.strip()


def process_text(text, max_line_length=80):
    if not text or text.strip() == "":
        return {"processed_text": "", "formatted_text": "", "sentiment": "neutral", "processed_Keywords": []}
    normalized_text = clean_and_normalize_text(text)
    sentences = tokenize_sentences(normalized_text)
    words = tokenize_words(normalized_text)
    stop_words = get_stopwords()
    # Load Volubilis words if available
    volubilis_words = thai_volubilis_words() if PYTHAINLP_AVAILABLE else frozenset()
    word_freq = Counter(words)
    filtered_keywords = [
        word for word in words
        if word not in stop_words
        and word.strip()
        and (word in thai_words() or word in volubilis_words or len(word) >= 3)
        and not re.match(r'^\d+[.,]?\d*$', word)
        and re.match(r'^[ก-๙a-zA-Z]+$', word)
    ]
    keyword_scores = {word: word_freq[word] for word in set(filtered_keywords)}
    processed_keywords = sorted(keyword_scores.keys(), key=lambda x: keyword_scores[x], reverse=True)[:30]

    # Build processed_text
    processed_text = ""
    paragraphs = normalized_text.split("\n")
    for paragraph in paragraphs:
        paragraph = paragraph.strip()
        if not paragraph:
            continue
        # Tokenize with tokenize_words
        tokens = tokenize_words(paragraph)
        # Filter stopwords (words in PRESERVE_WORDS are never filtered out)
        filtered_tokens = [token for token in tokens if (token in PRESERVE_WORDS or token not in stop_words) and token.strip()]
        # Combine related words using thai_words and related_pairs
        processed_words = combine_related_words(filtered_tokens)
        # Adjust word joining for words in PRESERVE_WORDS ("ถึง")
        adjusted_words = []
        preserve_set = set(PRESERVE_WORDS.keys())
        i = 0
        while i < len(processed_words):
            current = processed_words[i]
            if current in preserve_set and i > 0 and i + 1 < len(processed_words):
                prev_word = adjusted_words[-1]
                next_word = processed_words[i + 1]
                if re.match(r'^\d+[.,]?\d*$', next_word):
                    # Case where the next token is a number: "อายุ ถึง 4,000 ปี" → "อายุถึง 4,000 ปี"
                    adjusted_words[-1] = prev_word + current
                    # Do not merge the next token (keep the normal space)
                else:
                    # Case where the next token is text: "ย้อน ถึง ทวาราวดี" → "ย้อนถึงทวาราวดี"
                    adjusted_words[-1] = prev_word + current + next_word
                    i += 1  # Skip the token that was just merged
            else:
                adjusted_words.append(current)
            i += 1
        # Use Volubilis to merge adjacent tokens (reduce spacing)
        final_words = []
        i = 0
        while i < len(adjusted_words):
            if i + 1 < len(adjusted_words):
                combined = adjusted_words[i] + adjusted_words[i + 1]
                if combined in volubilis_words:
                    final_words.append(combined)
                    i += 2  # Skip the merged word
                else:
                    final_words.append(adjusted_words[i])
                    i += 1
            else:
                final_words.append(adjusted_words[i])
                i += 1
        # Join final_words with spaces
        processed_paragraph = " ".join(final_words).strip()
        if processed_text:
            processed_text += "\n"
        if paragraph.startswith('•'):
            processed_text += f"<br/> {processed_paragraph}"
        else:
            processed_text += processed_paragraph

    sentiment = analyze_sentiment(normalized_text)

    # Build formatted_text
    formatted_text = ""
    paragraphs = normalized_text.split("\n")
    for paragraph in paragraphs:
        paragraph = paragraph.strip()
        if not paragraph:
            continue
        proc_para = format_paragraph(paragraph, processed_keywords)
        if paragraph.startswith('•'):
            formatted_text += f"<p>{proc_para}</p>"
        else:
            formatted_text += f"<p>{proc_para}</p>"
        formatted_text += "<br/>"
    for keyword in processed_keywords:
        formatted_text = re.sub(rf'\b({keyword})\b', r'<strong>\1</strong>', formatted_text, flags=re.IGNORECASE)
    return {
        "processed_text": processed_text,
        "formatted_text": formatted_text.strip(),
        "sentiment": sentiment,
        "processed_Keywords": processed_keywords
    }
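
A minimal usage sketch (not part of the original file), assuming the module's local dependencies (mara1_model, the bundled nltk_data directory, and optionally PyThaiNLP) are importable from /var/www/afra/py.afaa.website/; the sample string and variable names below are hypothetical:

    # Hypothetical example: run process_text() on a short mixed Thai/English string
    from travel_pythainlp import process_text

    sample = "เชียงใหม่ is a popular destination.\n• อายุ ถึง 4,000 ปี"
    result = process_text(sample)

    print(result["sentiment"])            # always "neutral" in this module
    print(result["processed_Keywords"])   # up to 30 keywords ranked by frequency
    print(result["formatted_text"])       # paragraphs wrapped in <p>...</p>, keywords in <strong>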