Enable mp4 input in video pipeline

- Enable HW video decoder for video file src
- Adjust the render_size scaling mechanism
- Take care of stride in the inference frame
  (glbox generates stride as multiples of 4)

Currently we need to pass the decoded stream through mtkmdp, but
this plugin has many bugs (no GL support, format errors sent to
rsvgoverlay, etc.). The plan of record (POR) is to use v4l2convert instead.

Bug: 151393184
Tested: edgetpu_demo --device
Tested: edgetpu_detect (with webcam)
Tested: edgetpu_classify (with image)
Change-Id: Ic448069c1a950e077a4ff62ba84aedffebbc9c31
diff --git a/debian/control b/debian/control
index c413cba..be6b4b6 100644
--- a/debian/control
+++ b/debian/control
@@ -14,6 +14,7 @@
          gir1.2-gst-plugins-base-1.0,
          gir1.2-gstreamer-1.0,
          gir1.2-gtk-3.0,
+         gst-mtkmdp,
          gstreamer1.0-gl,
          gstreamer1.0-plugins-bad,
          gstreamer1.0-plugins-good,
diff --git a/edgetpuvision/gstreamer.py b/edgetpuvision/gstreamer.py
index 57733ff..7d0d4ee 100644
--- a/edgetpuvision/gstreamer.py
+++ b/edgetpuvision/gstreamer.py
@@ -139,10 +139,11 @@
 def make_layout(inference_size, render_size):
     inference_size = Size(*inference_size)
     # render_size capped to 1280x720
-    render_size = Size(*render_size)
-    width, height = render_size
-    if height > 720:
-        render_size = render_size * 720 / height
+    input_size = Size(*render_size)
+    width, height = input_size
+    render_size = input_size * 720 / height
+    if (render_size.width > 1280):
+      render_size = input_size * 1280 / width
     size = min_outer_size(inference_size, render_size)
     window = center_inside(render_size, size)
     return Layout(size=size, window=window,
@@ -176,15 +177,16 @@
 def pull_sample(sink):
     sample = sink.emit('pull-sample')
     buf = sample.get_buffer()
+    meta = GstVideo.buffer_get_video_meta(buf)
 
     result, mapinfo = buf.map(Gst.MapFlags.READ)
     if result:
-        yield sample, mapinfo.data
+        yield sample, mapinfo.data, meta
     buf.unmap(mapinfo)
 
 def new_sample_callback(process):
     def callback(sink, pipeline):
-        with pull_sample(sink) as (sample, data):
+        with pull_sample(sink) as (sample, data, meta):
             process(data, caps_size(sample.get_caps()))
         return Gst.FlowReturn.OK
     return callback
@@ -212,7 +214,7 @@
         overlay.set_eos()
 
 def on_new_sample(sink, pipeline, render_overlay, layout, images, get_command):
-    with pull_sample(sink) as (sample, data):
+    with pull_sample(sink) as (sample, data, meta):
         custom_command = None
         save_frame = False
 
@@ -228,8 +230,13 @@
         else:
             custom_command = command
         # Read the data as an image before it is consumed by the model.
+        # Also take care of stride.
+        bpp = 3 # bytes per pixel
+        buf_stride = meta.stride[0]
         image_width, image_height = layout.inference_size
-        inference_img = Image.frombytes('RGB', (image_width, image_height), data, 'raw')
+        inf_stride = image_width * bpp
+        inference_img = Image.frombytes('RGB', (image_width, image_height), data, 'raw',
+                                        'RGB', buf_stride - inf_stride, 1)
         svg = render_overlay(inference_img,
                              command=custom_command)
 
@@ -264,27 +271,15 @@
 def get_pipeline(source, inference_size, display):
     fmt = parse_format(source)
     if fmt:
-        # Cap the render size at 720p
-        _, camera_height = fmt.size
-        render_size = fmt.size * 720 / camera_height
-        layout = make_layout(inference_size, render_size)
+        layout = make_layout(inference_size, fmt.size)
         return layout, camera_pipeline(fmt, layout, display)
 
     filename = os.path.expanduser(source)
     if os.path.isfile(filename):
-        # TODO: Revert this back to info.is_image() check
-        is_image = filename.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff',
-            '.tif', '.bmp', '.gif'))
-        if is_image:
-            info = get_video_info(filename)
-            render_size = Size(info.get_width(), info.get_height()) * 720 / info.get_height()
-        else:
-            # TODO: Remove YUV check
-            if not filename.lower().endswith(('.yuv')):
-                raise ValueError('Only support YUV raw videos')
-            render_size = Size(1280, 720)
+        info = get_video_info(filename)
+        render_size = Size(info.get_width(), info.get_height())
         layout = make_layout(inference_size, render_size)
-        return layout, file_pipline(is_image, filename, layout, display)
+        return layout, file_pipline(info.is_image(), filename, layout, display)
 
     return None
 
diff --git a/edgetpuvision/pipelines.py b/edgetpuvision/pipelines.py
index 38cd170..918454d 100644
--- a/edgetpuvision/pipelines.py
+++ b/edgetpuvision/pipelines.py
@@ -14,30 +14,26 @@
 
 from .gst import *
 
-def decoded_file_src(filename):
+#TODO: Use v4l2convert to run mdp.
+def decoded_file_src(filename, render_size):
     return [
         Source('file', location=filename),
-        #Filter('decodebin'),
-        Filter('videoparse width=960 height=540 format=4'),
-        Caps('video/x-raw,framerate=25/1'),
-        Filter('videoconvert'),
-        Caps('video/x-raw,format=I420'),
-        Filter('glfilterbin filter=glbox'),
-        Caps('video/x-raw,width=1280,height=720,format=BGRA'),
+        Filter('decodebin'),
+        Filter('mtkmdp width=%d height=%d format=BGRA' % (render_size.width, render_size.height)),
     ]
 
-#TODO: Remove this function when video codec is available
-def decoded_img_file_src(filename):
+#TODO: Remove this function when v4l2convert is available.
+def decoded_img_file_src(filename, render_size):
     return [
         Source('file', location=filename),
         Filter('decodebin'),
         Filter('videoconvert'),
         Caps('video/x-raw,format=I420'),
         Filter('glfilterbin filter=glbox'),
-        Caps('video/x-raw,height=720,format=BGRA'),
+        Caps('video/x-raw,width=%d,height=%d,format=BGRA' % (render_size.width, render_size.height)),
     ]
 
-def v4l2_src(fmt):
+def v4l2_src(fmt, render_size):
     return [
         Source('v4l2', device=fmt.device),
         # TODO: use YUV input when MIPI camera is ready
@@ -45,11 +41,12 @@
              framerate='%d/%d' % fmt.framerate),
         Filter('decodebin'),
         Filter('glfilterbin filter=glbox'),
-        Caps('video/x-raw', height=720, format='BGRA'),
+        Caps('video/x-raw', width=render_size.width, height=render_size.height, format='BGRA'),
     ]
 
 def display_sink():
     return [
+        Filter('videoconvert'),
         Filter('rsvgoverlay', name='svg_overlay'),
         Sink('wayland', name='glsink', sync=False)
     ]
@@ -68,7 +65,7 @@
 # Display
 def image_display_pipeline(filename, layout):
     return (
-        [decoded_img_file_src(filename),
+        [decoded_img_file_src(filename, layout.render_size),
          Filter('imagefreeze'),
          Caps('video/x-raw', framerate='30/1'),
          Tee(name='t')],
@@ -83,10 +80,10 @@
 
 def video_display_pipeline(filename, layout):
     return (
-        [decoded_file_src(filename),
+        [decoded_file_src(filename, layout.render_size),
          Tee(name='t')],
         [Pad('t'),
-         Queue(),
+         Queue(max_size_buffers=1, leaky='downstream'),
          display_sink()],
         [Pad('t'),
          Queue(max_size_buffers=1, leaky='downstream'),
@@ -95,7 +92,7 @@
 
 def camera_display_pipeline(fmt, layout):
     return (
-        [v4l2_src(fmt),
+        [v4l2_src(fmt, layout.render_size),
          Tee(name='t')],
         [Pad('t'),
          Queue(max_size_buffers=1, leaky='downstream'),
@@ -108,7 +105,7 @@
 # Headless
 def image_headless_pipeline(filename, layout):
     return (
-      [decoded_img_file_src(filename),
+      [decoded_img_file_src(filename, layout.render_size),
        Filter('imagefreeze'),
        Filter('glupload'),
        inference_pipeline(layout)],
@@ -116,14 +113,14 @@
 
 def video_headless_pipeline(filename, layout):
     return (
-        [decoded_file_src(filename),
+        [decoded_file_src(filename, layout.render_size),
          Filter('glupload'),
          inference_pipeline(layout)],
     )
 
 def camera_headless_pipeline(fmt, layout):
     return (
-        [v4l2_src(fmt),
+        [v4l2_src(fmt, layout.render_size),
          Filter('glupload'),
          inference_pipeline(layout)],
     )