# encoding: utf-8

#
# Copyright (c) 2011 by ole <ole@ole.im>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

#
# Print titles for URLs, only in the active buffer
# (this script requires WeeChat 0.3.0 or newer)
#
# History:
# 2011-09-11, ole <ole@ole.im>
#     version 0.2-dev: add youtube support
# 2011-09-09, ole <ole@ole.im>
#     version 0.1-dev: dev snapshot
#

# Whether YouTube links get a dedicated lookup (category + video title).
# Switched off by the import fallback below when neither json nor simplejson
# can be imported.
ENABLE_YOUTUBE = True

import weechat as w
import re
try:
  import json
except ImportError:
  # Import used for Python versions prior to 2.6
  # *** You will have to download and install simplejson for this to work.
  try:
    import simplejson as json
  except ImportError:
    # if we can't get this either, don't read YouTube category from gdata.youtube.com
    ENABLE_YOUTUBE = False
    
from HTMLParser import HTMLParser
from urllib2 import urlopen

# Script metadata, handed to w.register() when the script is loaded.
SCRIPT_NAME = 'url_title'
SCRIPT_AUTHOR  = "Ole Bergmann <ole@ole.im>"
SCRIPT_VERSION = "0.2"
SCRIPT_LICENSE = "GPL3"
SCRIPT_DESC    = "Output URL Titles written in any channel"


# Colors: built from the "default"/"reset" color names only, because we don't
# want to screw up people's themes.
COLOR_RESET = w.color('reset')
# used for the title text (presumably "*" means bold -- confirm against the WeeChat docs)
COLOR_TITLE = w.color("*default")
# used for the " - <url>" part printed after the title
COLOR_LINK  = w.color('default')

# URLs that are currently being looked up. This is a dict, not a list:
# it maps each URL to the buffer its title should be printed in.
# Entries are added by url_append() and removed again by url_process().
URLS = {}

# RegEx pattern to match urls (group 1 is the whole URL).
# NOTE: the literal "&amp;" inside the character classes looks like an
# HTML-escaping artifact. It is left untouched on purpose: 'a', 'm' and 'p' are
# already covered by \w, so its only effect is to allow ';' inside URLs.
RE_URL = re.compile(r'(((http|ftp|https):\/\/|www\.)[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?)')

# RegEx pattern to match the <title> element of a page (group 1 is its text).
# The opening tag is matched with [^>]* instead of a greedy .* -- with re.S a
# greedy .* runs to the LAST '>' that is still followed by a </title>, which
# picks up the wrong text when the page holds a second <title> (e.g. inside an
# inline SVG) or when the title itself contains a '>'.
RE_TITLE = re.compile(r'<title\b[^>]*>(.*?)<\/title>', re.I | re.S)

# RegEx pattern to match popular file extensions that should NOT be checked.
# The \. in front of the extension keeps ordinary pages whose URL merely ends
# in the same letters (".../what-is-a-gif") from being skipped.
RE_FILES = re.compile(r'.*\.(png|jpg|bmp|gif|avi|mpg|flv|3gp|mp4|exe|msi|mp3|flac|tar\.gz|tar\.bz2)$', re.I)

# RegEx used to match YouTube links (group 2 is the video id).
# Raw string, so the regex escapes are not read as string escapes.
RE_YOUTUBE = re.compile(r'http://(www\.)?youtube\.com/watch\?v=([a-zA-Z0-9\-_]{10,13})', re.I)

'''
 The functions below attempt to retrieve a title from the given page content
'''

def youtube_title(page):
  """Build the title line for a YouTube video from its JSON feed.

  page -- the raw JSON document read from the video feed (a byte string;
          already decoded text is accepted as well).

  Returns '<reset>(<category>)<title color> <video title>', or '' when the
  document is not valid JSON or lacks the expected entries, so that the caller
  can fall back to the generic <title> lookup instead of failing.
  """

  try:
    # byte strings need decoding first, text is handed to the parser as-is
    if hasattr(page, 'decode'):
      page = page.decode('utf-8')

    entry = json.loads(page)['entry']

    # get category and video title
    title = entry['title']['$t']
    category = entry['media$group']['media$category'][0]['label']
  except (ValueError, KeyError, IndexError, TypeError, AttributeError):
    # ValueError also covers invalid JSON and undecodable bytes; the others
    # mean the document does not have the expected structure.
    return ''

  # make sure the category is not bold formatted like the video title
  return '%s(%s)%s %s' % (COLOR_RESET, category, COLOR_TITLE, title)

def url_title(page):
  """Extract the text of the <title> element from page content.

  page -- the (possibly truncated) content of the page as a byte string.

  Returns the title with every run of whitespace collapsed into a single space
  and HTML entities decoded. Always returns a string: '' when there is no page,
  no <title> element, or the title is empty.
  """

  if not page:
    return ''

  # search the page for a <title> element
  try:
    match = RE_TITLE.search(page)
  except TypeError:
    # not something a regex can be run on
    return ''

  if not match:
    return ''

  raw = match.group(1)

  # Pages are not guaranteed to be UTF-8: replace undecodable bytes instead of
  # letting a UnicodeDecodeError escape into the caller.
  try:
    raw = raw.decode('utf-8', 'replace')
  except (AttributeError, UnicodeError):
    # already text, keep it as it is
    pass

  # remove any whitespace we encounter (e.g. newlines) and replace it with a single space
  title = ' '.join(raw.split())
  if not title:
    return ''

  # return the title html decoded
  try:
    return HTMLParser().unescape(title)
  except AttributeError:
    return ''

'''
  Really simple function for adding a URL to the URLS dict
  (the URL is the key, the buffer it was posted in is the value)
'''

def url_append(url, buffer=""):
  """Record `url` in URLS together with the buffer it belongs to.

  An entry that already exists for the same URL is overwritten.
  """
  # item assignment changes the module-level dict in place
  URLS[url] = buffer

'''
  I don't like to do this, but it is otherwise impossible to run a background
  process within a WeeChat script.

  Fetching the page in a hooked process is done instead of delaying the message
  we parsed the URL from.
'''

# Output received so far for every URL whose fetch command is still running.
_URL_OUTPUT = {}

def url_process(url, command, rc, stdout, stderr):
  """hook_process callback: print the title of a fetched URL in its buffer.

  url     -- the URL as posted in the channel; this is the callback data given
             to w.hook_process() and the key into URLS.
  command -- the command that was run (unused).
  rc      -- return code of the command; w.WEECHAT_HOOK_PROCESS_RUNNING means
             that more output may still follow.
  stdout  -- the output received with this call.
  stderr  -- error output of the command (unused).

  Output is collected until the command is no longer running, so the title is
  looked up in the complete output and not in whatever happened to arrive
  first. The URLS entry is always removed once the command is done, also when
  no title could be extracted.

  Always returns w.WEECHAT_RC_OK.
  """

  # not (or no longer) waiting for this url: nothing to do
  if url not in URLS:
    _URL_OUTPUT.pop(url, None)
    return w.WEECHAT_RC_OK

  if stdout:
    _URL_OUTPUT.setdefault(url, []).append(stdout)

  # the command is still running, wait for the rest of its output
  if int(rc) == w.WEECHAT_HOOK_PROCESS_RUNNING:
    return w.WEECHAT_RC_OK

  # the command ended (or failed): forget the url whatever happens below
  target = URLS.pop(url)
  page = ''.join(_URL_OUTPUT.pop(url, []))

  parse_errors = (ValueError, KeyError, IndexError, TypeError, AttributeError)
  title = ""

  # unfortunately, we have to check up on this again,
  # since hook_process doesn't let us pass more than one variable:
  if ENABLE_YOUTUBE and RE_YOUTUBE.match(url):
    try:
      title = youtube_title(page)
    except parse_errors:
      # unusable feed, fall through to the generic lookup
      title = ""

  # Not a YouTube link (or the feed gave us nothing).
  if not title:
    try:
      title = url_title(page)
    except parse_errors:
      title = ""

  if title:
    w.prnt(target,
      "+++\t%s%s %s%s- %s" % \
      (
       COLOR_TITLE,
       title,
       COLOR_RESET,
       COLOR_LINK,
       url,
      )
    )

  return w.WEECHAT_RC_OK

def message_parse(data, signal, signal_data):
  """Signal callback: start a title lookup for a URL posted in the current buffer.

  data        -- callback data (unused).
  signal      -- name of the signal; the part before the first ',' is taken as
                 the server name.
  signal_data -- the raw IRC message.

  Only the first URL of a message is looked up, and only when the message
  belongs to the buffer that is currently displayed. URLs ending in one of the
  extensions matched by RE_FILES are skipped.

  Always returns w.WEECHAT_RC_OK.
  """

  # server and channel tell us which buffer the message belongs to
  server = signal.split(",")[0]
  parts = signal_data.split(":")

  try:
    channel = parts[1].split(" ")[-2]
  except IndexError:
    # Don't check url titles on other than channels:
    return w.WEECHAT_RC_OK

  # everything after the second ':' is the actual message
  message = ':'.join(parts[2:])

  # we only check for urls in the current buffer
  origin = w.info_get("irc_buffer", "%s,%s" % (server, channel))
  if origin != w.current_buffer():
    return w.WEECHAT_RC_OK

  found = RE_URL.search(message)
  if not found:
    return w.WEECHAT_RC_OK

  url = found.group(1)

  # We don't want to download files and pictures.
  if RE_FILES.match(url):
    return w.WEECHAT_RC_OK

  # Assume http protocol if url matched with only the www. portion
  if url.startswith("www."):
    url = "http://" + url

  # remember which buffer the url belongs to, url_process needs it later
  url_append(url, origin)

  # by default fetch the url itself and read only its first 4096 bytes
  lookup_url, read_bytes = url, "4096"

  # youtube links (if json is available) are looked up through the feed,
  # which has to be read ENTIRELY to be valid json
  if ENABLE_YOUTUBE:
    yt = RE_YOUTUBE.match(url)
    if yt:
      lookup_url = "http://gdata.youtube.com/feeds/videos/%s?alt=json" % yt.group(2)
      read_bytes = ""

  # Check for python2 bin on systems where python3 is default
  python2_bin = w.info_get("python2_bin", "") or "python"

  # NOTE(review): the url is pasted into a quoted command line; RE_URL matches
  # no quote characters, which is the only thing keeping that quoting intact.
  cmd = python2_bin + " -c \"from urllib2 import urlopen; print(urlopen('%s').read(%s))\"" % (lookup_url, read_bytes)

  # Wait 15 seconds before killing the process
  w.hook_process(cmd, 15 * 1000, "url_process", url)

  return w.WEECHAT_RC_OK

# Nothing happens when the script is imported from another source. When it is
# run directly, it registers itself with weechat and, if that succeeds, hooks
# the incoming private messages of all servers.
if __name__ == "__main__" and w.register(
    SCRIPT_NAME, SCRIPT_AUTHOR, SCRIPT_VERSION, SCRIPT_LICENSE, SCRIPT_DESC, "", ""):
  w.hook_signal("*,irc_in_privmsg", "message_parse", "")