# -*- Mode: Python -*-

# Author: Sam Rushing (http://www.nightmare.com/rushing)

# parse the hidden datecode from a sony HDV transport stream,
#   and use this information to split the stream into clips.
#
# is this some kind of standard?
# will it work with non-sony HDV streams?

# got a clue on where to look from the linux 'dvgrab' project,
# which refers to two sony private data fields tagged as 0xa0 and 0xa1.
# the stream with PID==2065 looks like an adaptation field...
#
# the BCD sets unused bits to one, which made it a little harder
#  to see the pattern...

# Motivation: I want to back up my HDV tapes.  However, I want to
#  back up the *original* mpeg-2 data, not some transcoded version.
#  However, my long-term backup solution will be to a hard drive.
#  The most useful filesystem to use on that drive is FAT32.
#    [for example, I can plug it into a PS3 and watch the video]
#  FAT32 can't store large files, so I need to split the 11G into
#    reasonably-sized chunks.
# On windows I would use HDVSplit.  But I don't want to run windows
#  just to split HDV files.
#
# 2009 03 31: had to put in a hack to try to cut off looooong clips
#   before they hit the 4GB FAT32 limit.

import os
import struct
import sys
import time

# 11:34:51 => 11:35:03
# d1b4d130 => 84b5d130
#
# 11010001 10110100 11010001 0011 0000
#   5   1     3  4     1   1
# 10000100 10110101 11010001 0011 0000
#   0   4     3  5     1   1              
#
#

def bcd (ch, mask0, mask1):
    # decode one packed-BCD byte into an integer.
    #   ch:    the byte, either a one-character string (what indexing a
    #          python 2 str gives) or an int 0..255 (what indexing a
    #          python 3 bytes object gives).
    #   mask0: mask applied to the high nibble (the tens digit)
    #   mask1: mask applied to the low nibble (the ones digit)
    # the masks strip the unused bits, which the camera sets to one.
    value = ch if isinstance (ch, int) else ord (ch)
    tens = (value >> 4) & mask0
    ones = value & mask1
    return tens * 10 + ones

def grok_hms (hms):
    # decode the time-of-day field: byte 0 holds the seconds, byte 1 the
    # minutes and byte 2 the hours, each as BCD.  the tens digit of the
    # hours is masked to two bits, the other two to three bits.
    # returns (hours, minutes, seconds); any bytes past the third are ignored.
    seconds, minutes, hours = [
        bcd (hms[position], tens_mask, 0xf)
        for position, tens_mask in enumerate ((0x7, 0x7, 0x3))
        ]
    return hours, minutes, seconds

# d4e706
# 11010100 11100111 00000110
#   dd        mm       yy 
#   14        07       06
# xx001111 xxx23333 44445555
#

def grok_ymd (ymd):
    # decode the date field: byte 0 is the day, byte 1 the month and
    # byte 2 the two-digit year, each as BCD.  the year is taken to be
    # in the 2000s.  returns (year, month, day).
    day_byte, month_byte, year_byte = ymd[0], ymd[1], ymd[2]
    return (
        2000 + bcd (year_byte, 0xf, 0xf),
        bcd (month_byte, 0x1, 0xf),
        bcd (day_byte, 0x3, 0xf),
        )

class clip:
    # one output file, named after the moment its first packet was
    # recorded; on close the file's modification time is set to that
    # same moment.

    def __init__ (self, time_tuple):
        # time_tuple: a time tuple as accepted by time.mktime/time.strftime.
        # the output file is created (truncating any existing file of the
        # same name) in the current directory.
        started = time.mktime (time_tuple)
        label = time.strftime ('%Y-%m-%d_%H_%M_%S', time_tuple)
        self.time_tuple = time_tuple
        self.time_t = started
        self.name = 'clip_%s.m2t' % (label,)
        self.file = open (self.name, 'wb')
        # running count of bytes written so far
        self.size = 0

    def write (self, packet):
        # append a packet to the clip and account for its length
        self.file.write (packet)
        self.size = self.size + len (packet)

    def close (self):
        # finish the file, then stamp it: access time is 'now',
        # modification time is the clip's start time.
        self.file.close()
        accessed = time.time()
        os.utime (self.name, (accessed, self.time_t))

MB = 1024 * 1024

# FAT32 cannot hold a file of 4GB or more.
LIMIT = 4 * 1024 * MB
# clips are cut once they reach 90% of that limit.  floor division keeps
# this an integer byte count even where '/' means true division.
BIG = (90 * LIMIT) // 100

def _byte_fields (packet, start, stop):
    # packet[start:stop] as a list of one-byte slices.  unlike indexing,
    # slicing yields a length-1 string on both python 2 and python 3,
    # which is what ord() expects.
    return [packet[i:i+1] for i in range (start, stop)]

def split_by_datecode (f, threshold, scan_only, datecode_pid=2065):
    # walk a transport stream packet by packet and start a new clip
    # whenever the recorded datecode jumps forward by more than
    # <threshold> seconds, or the current clip reaches BIG bytes.
    #
    #   f:            seekable file object opened in binary mode
    #   threshold:    gap, in seconds, between consecutive datecodes that
    #                 starts a new clip
    #   scan_only:    if true, only report the split points on stderr;
    #                 no clip files are written
    #   datecode_pid: PID of the packets carrying the datecode
    #
    # both plain 188-byte packets and 192-byte packets (188 bytes preceded
    # by a 4-byte timecode) are accepted; the 4-byte prefix is not copied
    # to the clips.  packets seen before the first datecode are dropped.
    # raises ValueError if neither layout matches the start of the file.
    probe = f.read (128)
    # slices compare correctly against b'G' on python 2 and 3 alike, and
    # are simply empty (rather than raising) when the file is too short.
    if probe[0:1] == b'G':
        with_timecode = False
        packet_size = 188
    elif probe[4:5] == b'G':
        with_timecode = True
        packet_size = 192
    else:
        raise ValueError ("doesn't look like an mpeg transport stream")
    f.seek (0)
    last = 0
    n = 0
    fo = None
    try:
        while 1:
            if with_timecode:
                # skip the timecode prefix
                f.read (4)
            packet = f.read (188)
            if not packet:
                break
            if packet[0:1] != b'G':
                # say so, rather than silently producing a short backup
                sys.stderr.write ('lost sync after %d bytes, stopping\n' % (n,))
                break
            if len (packet) < 188:
                # truncated final packet: too short to parse safely, but
                # keep its bytes with the clip they belong to.
                if fo:
                    fo.write (packet)
                break
            n += packet_size
            # the PID is the low 13 bits of header bytes 1-2
            pid = struct.unpack ('>H', packet[1:3])[0] & 0x1fff
            if pid == datecode_pid:
                ymd = grok_ymd (_byte_fields (packet, 109, 112))
                hms = grok_hms (_byte_fields (packet, 113, 117))
                # -1: let mktime work out whether DST applies
                time_tuple = ymd + hms + (0, 0, -1)
                time_t = int (time.mktime (time_tuple))
                delta = time_t - last
                if delta > threshold or (fo and fo.size >= BIG):
                    sys.stderr.write ('%5dM %s\n' % (n // MB, time.ctime (time_t)))
                    if fo:
                        fo.close()
                        # forget it, so the cleanup below cannot close it twice
                        fo = None
                    if not scan_only:
                        fo = clip (time_tuple)
                last = time_t
            if fo:
                fo.write (packet)
    finally:
        # close (and time-stamp) the clip in progress even on error
        if fo:
            fo.close()

if __name__ == '__main__':
    # usage: <script> [-s] <stream.m2t>
    #   -s  scan only: list the split points without writing any clips
    # the flag may come before or after the file name.
    args = sys.argv[1:]
    scan_only = '-s' in args
    paths = [arg for arg in args if arg != '-s']
    if not paths:
        sys.stderr.write ('usage: %s [-s] <stream.m2t>\n' % (sys.argv[0],))
        sys.exit (1)
    stream = open (paths[0], 'rb')
    try:
        split_by_datecode (
            stream,
            # 15 minutes
            15 * 60,
            scan_only
            )
    finally:
        stream.close()
