Add new edgetpuvision features.

* Better GStreamer pipelines
* Inference on image files
* Streaming from .mp4 files
* Streaming from any V4L2 source
* Switching between .tflite models at runtime
* edgetpu_demo script
* Simplified API based on Python generators (see the sketch below)
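
The new --source flag accepts either a V4L2 descriptor
(/dev/videoN:FMT:WxH:N/D, default /dev/video0:YUY2:1280x720:30/1) or a
path to an .mp4 file.

The generator-based API is driven roughly like this (a minimal sketch;
the 224x224 input size and my_render_overlay() are illustrative
placeholders, not part of this change):

  def render_gen(args):
      # The first yield reports the model input size (w, h) to run_gen().
      yield (224, 224)  # placeholder size
      output = None
      while True:
          # Receive the frame tensor, render size, visible window, current
          # inference rate and the last keypress; yield an SVG overlay
          # string, or None to draw nothing.
          tensor, size, window, inference_rate, command = (yield output)
          # my_render_overlay() is a hypothetical helper, not part of this CL.
          output = my_render_overlay(tensor, size, window, inference_rate)

  gstreamer.run_gen(render_gen(args),
                    source=args.source,
                    downscale=args.downscale,
                    fullscreen=args.fullscreen)

Multiple models can be passed as a comma-separated --model list;
pressing 'n' switches to the next one and 'o' toggles the overlay.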

Change-Id: I95b8658977fe586ff5f595e75c84ee2e2c46756f
diff --git a/.gitignore b/.gitignore
index 5ce6616..52b8614 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
-dist
 build
+dist
+__pycache__
 *.egg-info
+.DS_Store
diff --git a/bin/edgetpu_demo b/bin/edgetpu_demo
new file mode 100755
index 0000000..fbab807
--- /dev/null
+++ b/bin/edgetpu_demo
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+#readonly VIDEO_FILE=""
+readonly EDGETPU_DIR=$(python3 -c 'import edgetpu; import os; print(os.path.dirname(edgetpu.__file__))')
+readonly TEST_DATA_DIR="${EDGETPU_DIR}/test_data"
+readonly TPU_MODEL_FILE="${TEST_DATA_DIR}/mobilenet_ssd_v2_coco_quant_postprocess_edgetpu.tflite"
+readonly LABELS_FILE="${TEST_DATA_DIR}/coco_labels.txt"
+
+if [ "$1" = "--device" ]; then
+  python3 -m edgetpuvision.detect \
+      --source "${VIDEO_FILE}" \
+      --model="${TPU_MODEL_FILE}" \
+      --labels="${LABELS_FILE}" \
+      --fullscreen
+elif [ "$1" = "--stream" ]; then
+  python3 -m edgetpuvision.detect_server \
+      --source "${VIDEO_FILE}" \
+      --model="${TPU_MODEL_FILE}" \
+      --labels="${LABELS_FILE}"
+else
+  echo "Run on-device inference:"
+  echo "  $0 --device"
+  echo "Run streaming server:"
+  echo "  $0 --stream"
+fi
diff --git a/edgetpuvision/camera.py b/edgetpuvision/camera.py
index d02c952..bcaa9b3 100644
--- a/edgetpuvision/camera.py
+++ b/edgetpuvision/camera.py
@@ -1,3 +1,4 @@
+import os
 import threading
 
 import numpy as np
@@ -5,77 +6,14 @@
 from . import gstreamer
 from .gst import *
 
-
-def inference_pipeline(render_size, inference_size):
-    size = max_inner_size(render_size, inference_size)
-    return (
-        Filter('glfilterbin', filter='glcolorscale'),
-        Caps('video/x-raw', format='RGBA', width=size.width, height=size.height),
-        Filter('videoconvert'),
-        Caps('video/x-raw', format='RGB', width=size.width, height=size.height),
-        Filter('videobox', autocrop=True),
-        Caps('video/x-raw', width=inference_size.width, height=inference_size.height),
-        Filter('appsink', name='appsink', emit_signals=True, max_buffers=1, drop=True, sync=False)
-    )
-
-
-def file_streaming_pipeline(filename, render_size, inference_size):
-    return (
-        Filter('filesrc', location=filename),
-        Filter('qtdemux'),
-        Filter('h264parse', config_interval=-1),
-        Caps('video/x-h264', stream_format='byte-stream', profile='baseline', alignment='nal'),
-        Tee(pins=((
-          Queue(),
-          Filter('vpudec'),
-          inference_pipeline(render_size, inference_size),
-        ), (
-          Queue(),
-          Filter('appsink', name='h264sink', emit_signals=True, max_buffers=1, drop=False, sync=False),
-        )))
-    )
-
-
-def camera_streaming_pipeline(render_size, inference_size, profile, bitrate):
-    size = max_inner_size(render_size, inference_size)
-    return (
-        Filter('v4l2src', device='/dev/video1'),
-        Caps('video/x-raw', format='YUY2', width=640, height=360, framerate='15/1'),
-        Tee(pins=((
-          Queue(),
-          inference_pipeline(render_size, inference_size)
-        ), (
-          Queue(),
-          Filter('videoconvert'),
-          Filter('x264enc',
-                 speed_preset='ultrafast',
-                 tune='zerolatency',
-                 threads=4,
-                 key_int_max=5,
-                 bitrate=int(bitrate / 1000),  # kbit per second.
-                 aud=False),
-          Caps('video/x-h264', profile=profile),
-          Filter('h264parse'),
-          Caps('video/x-h264', stream_format='byte-stream', alignment='nal'),
-          Filter('appsink', name='h264sink', emit_signals=True, max_buffers=1, drop=False, sync=False),
-          # Tee(pins=((
-          #     Queue(),
-          #     Filter('appsink', name='h264sink', emit_signals=True, max_buffers=1, drop=False, sync=False)
-          # ),(
-          #     Queue(),
-          #     Filter('vpudec'),
-          #     Filter('kmssink', sync=False)
-          # )))
-        )))
-    )
-
-
-class InferenceCamera:
+class Camera:
     def __init__(self, render_size, inference_size):
         self._render_size = Size(*render_size)
         self._inference_size = Size(*inference_size)
+
         self._loop = gstreamer.loop()
         self._thread = None
+
         self.on_image = None
 
     @property
@@ -87,7 +25,7 @@
 
     def start_recording(self, obj, format, profile, inline_headers, bitrate, intra_period):
         size = min_outer_size(self._inference_size, self._render_size)
-        view_box = center_inside(self._render_size, size)
+        window = center_inside(self._render_size, size)
         fps_counter = gstreamer.avg_fps_counter(30)
 
         def on_buffer(data, _):
@@ -95,15 +33,14 @@
 
         def on_image(data, _):
             if self.on_image:
-                self.on_image(np.frombuffer(data, dtype=np.uint8), next(fps_counter), size, view_box)
+                self.on_image(np.frombuffer(data, dtype=np.uint8), next(fps_counter), size, window)
 
         signals = {
           'h264sink': {'new-sample': gstreamer.new_sample_callback(on_buffer)},
           'appsink': {'new-sample': gstreamer.new_sample_callback(on_image)},
         }
 
-        pipeline = camera_streaming_pipeline(self._render_size, self._inference_size,
-                                             profile=profile, bitrate=bitrate)
+        pipeline = self.make_pipeline(format, profile, inline_headers, bitrate, intra_period)
 
         self._thread = threading.Thread(target=gstreamer.run_pipeline,
                                         args=(self._loop, pipeline, signals))
@@ -112,3 +49,37 @@
     def stop_recording(self):
         self._loop.quit()
         self._thread.join()
+
+    def make_pipeline(self, fmt, profile, inline_headers, bitrate, intra_period):
+        raise NotImplementedError
+
+class FileCamera(Camera):
+    def __init__(self, filename, inference_size):
+        info = gstreamer.get_video_info(filename)
+        super().__init__((info.get_width(), info.get_height()), inference_size)
+        self._filename = filename
+
+    def make_pipeline(self, fmt, profile, inline_headers, bitrate, intra_period):
+        return gstreamer.file_streaming_pipeline(self._filename, self._render_size, self._inference_size)
+
+class V4L2Camera(Camera):
+    def __init__(self, fmt, inference_size):
+        super().__init__(fmt.size, inference_size)
+        self._fmt = fmt
+
+    def make_pipeline(self, fmt, profile, inline_headers, bitrate, intra_period):
+        return (
+            gstreamer.v4l2_camera(self._fmt),
+            gstreamer.camera_streaming_pipeline(profile, bitrate, self._render_size, self._inference_size)
+        )
+
+def make_camera(source, inference_size):
+    fmt = parse_format(source)
+    if fmt:
+        return V4L2Camera(fmt, inference_size)
+
+    filename = os.path.expanduser(source)
+    if os.path.isfile(filename):
+        return FileCamera(filename, inference_size)
+
+    return None
diff --git a/edgetpuvision/classify.py b/edgetpuvision/classify.py
index 2e3a642..7aec55c 100644
--- a/edgetpuvision/classify.py
+++ b/edgetpuvision/classify.py
@@ -8,13 +8,14 @@
 
 import argparse
 import collections
+import itertools
 import time
 
 from edgetpu.classification.engine import ClassificationEngine
 
 from . import gstreamer
 from . import overlays
-from .utils import load_labels
+from .utils import load_labels, input_image_size, same_input_image_sizes
 
 
 def top_results(window, top_k):
@@ -32,6 +33,44 @@
         window.append((yield top_results(window, top_k)))
 
 
+def render_gen(args):
+    acc = accumulator(size=args.window, top_k=args.top_k)
+    acc.send(None)  # Initialize.
+
+    engines = [ClassificationEngine(m) for m in args.model.split(',')]
+    assert same_input_image_sizes(engines)
+    engines = itertools.cycle(engines)
+    engine = next(engines)
+
+    labels = load_labels(args.labels)
+    draw_overlay = True
+
+    yield input_image_size(engine)
+
+    output = None
+    while True:
+        tensor, size, window, inference_rate, command = (yield output)
+
+        if draw_overlay:
+            start = time.monotonic()
+            results = engine.ClassifyWithInputTensor(tensor, threshold=args.threshold, top_k=args.top_k)
+            inference_time = time.monotonic() - start
+
+            results = [(labels[i], score) for i, score in results]
+            results = acc.send(results)
+            if args.print:
+                print(results)
+
+            output = overlays.classification(results, inference_time, inference_rate, size, window)
+        else:
+            output = None
+
+        if command == 'o':
+            draw_overlay = not draw_overlay
+        elif command == 'n':
+            engine = next(engines)
+
+
 def main():
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--source',
@@ -55,27 +94,7 @@
                         help='Fullscreen rendering.')
     args = parser.parse_args()
 
-    engine = ClassificationEngine(args.model)
-    labels = load_labels(args.labels)
-
-    acc = accumulator(size=args.window, top_k=args.top_k)
-    acc.send(None)  # Initialize.
-
-    def render_overlay(rgb, size, view_box, inference_fps):
-        start = time.monotonic()
-        results = engine.ClassifyWithInputTensor(rgb, threshold=args.threshold, top_k=args.top_k)
-        inference_time = time.monotonic() - start
-
-        results = [(labels[i], score) for i, score in results]
-        results = acc.send(results)
-        if args.print:
-            print(results)
-
-        return overlays.classification(results, inference_time, inference_fps, size, view_box)
-
-    _, h, w, _ = engine.get_input_tensor_shape()
-
-    if not gstreamer.run((w, h), render_overlay,
+    if not gstreamer.run_gen(render_gen(args),
                          source=args.source,
                          downscale=args.downscale,
                          fullscreen=args.fullscreen):
diff --git a/edgetpuvision/classify_server.py b/edgetpuvision/classify_server.py
index 98cc098..65695c2 100644
--- a/edgetpuvision/classify_server.py
+++ b/edgetpuvision/classify_server.py
@@ -14,15 +14,18 @@
 from edgetpu.classification.engine import ClassificationEngine
 
 from . import overlays
-from .camera import InferenceCamera
+from .camera import make_camera
 from .streaming.server import StreamingServer
-from .utils import load_labels
+from .utils import load_labels, input_image_size
 
 
 def main():
     logging.basicConfig(level=logging.INFO)
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--source',
+                        help='/dev/videoN:FMT:WxH:N/D or .mp4 file',
+                        default='/dev/video0:YUY2:1280x720:30/1')
     parser.add_argument('--model', required=True,
                         help='.tflite model path.')
     parser.add_argument('--labels', required=True,
@@ -36,17 +39,17 @@
     engine = ClassificationEngine(args.model)
     labels = load_labels(args.labels)
 
-    _, h, w, _ = engine.get_input_tensor_shape()
+    camera = make_camera(args.source, input_image_size(engine))
+    assert camera is not None
 
-    camera = InferenceCamera((640, 360), (w, h))
     with StreamingServer(camera) as server:
-        def on_image(rgb, inference_fps, size, view_box):
+        def on_image(tensor, inference_fps, size, window):
             start = time.monotonic()
-            results = engine.ClassifyWithInputTensor(rgb, threshold=args.threshold, top_k=args.top_k)
+            results = engine.ClassifyWithInputTensor(tensor, threshold=args.threshold, top_k=args.top_k)
             inference_time = time.monotonic() - start
 
             results = [(labels[i], score) for i, score in results]
-            server.send_overlay(overlays.classification(results, inference_time, inference_fps, size, view_box))
+            server.send_overlay(overlays.classification(results, inference_time, inference_fps, size, window))
 
         camera.on_image = on_image
         signal.pause()
diff --git a/edgetpuvision/detect.py b/edgetpuvision/detect.py
index afe2c1e..f46007d 100644
--- a/edgetpuvision/detect.py
+++ b/edgetpuvision/detect.py
@@ -12,13 +12,48 @@
 #   --labels=${TEST_DATA}/coco_labels.txt
 
 import argparse
+import itertools
 import time
 
 from edgetpu.detection.engine import DetectionEngine
 
 from . import gstreamer
 from . import overlays
-from .utils import load_labels
+from .utils import load_labels, input_image_size, same_input_image_sizes
+
+def render_gen(args):
+    engines = [DetectionEngine(m) for m in args.model.split(',')]
+    assert same_input_image_sizes(engines)
+    engines = itertools.cycle(engines)
+    engine = next(engines)
+
+    labels = load_labels(args.labels) if args.labels else None
+    filtered_labels = set(l.strip() for l in args.filter.split(',')) if args.filter else None
+    draw_overlay = True
+
+    yield input_image_size(engine)
+
+    output = None
+    while True:
+        tensor, size, window, inference_rate, command = (yield output)
+
+        if draw_overlay:
+            start = time.monotonic()
+            objs = engine.DetectWithInputTensor(tensor, threshold=args.threshold, top_k=args.top_k)
+            inference_time = time.monotonic() - start
+
+            if labels and filtered_labels:
+                objs = [obj for obj in objs if labels[obj.label_id] in filtered_labels]
+
+            output = overlays.detection(objs, labels, inference_time, inference_rate, size, window)
+        else:
+            output = None
+
+        if command == 'o':
+            draw_overlay = not draw_overlay
+        elif command == 'n':
+            engine = next(engines)
+
 
 def main():
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -40,25 +75,10 @@
                         help='Fullscreen rendering.')
     args = parser.parse_args()
 
-    engine = DetectionEngine(args.model)
-    labels = load_labels(args.labels) if args.labels else None
-    filtered_labels = set(x.strip() for x in args.filter.split(',')) if args.filter else None
-
-    def render_overlay(rgb, size, view_box, inference_fps):
-        start = time.monotonic()
-        objs = engine.DetectWithInputTensor(rgb, threshold=args.threshold, top_k=args.top_k)
-        inference_time  = time.monotonic() - start
-        if labels and filtered_labels:
-            objs = [obj for obj in objs if labels[obj.label_id] in filtered_labels]
-
-        return overlays.detection(objs, inference_time, inference_fps, labels, size, view_box)
-
-    _, h, w, _ = engine.get_input_tensor_shape()
-
-    if not gstreamer.run((w, h), render_overlay,
-                         source=args.source,
-                         downscale=args.downscale,
-                         fullscreen=args.fullscreen):
+    if not gstreamer.run_gen(render_gen(args),
+                             source=args.source,
+                             downscale=args.downscale,
+                             fullscreen=args.fullscreen):
         print('Invalid source argument:', args.source)
 
 
diff --git a/edgetpuvision/detect_server.py b/edgetpuvision/detect_server.py
index e8c1ca8..afcb359 100644
--- a/edgetpuvision/detect_server.py
+++ b/edgetpuvision/detect_server.py
@@ -13,20 +13,24 @@
 
 import argparse
 import logging
+import os
 import signal
 import time
 
 from edgetpu.detection.engine import DetectionEngine
 
 from . import overlays
-from .camera import InferenceCamera
+from .camera import make_camera
 from .streaming.server import StreamingServer
-from .utils import load_labels
+from .utils import load_labels, input_image_size
 
 def main():
     logging.basicConfig(level=logging.INFO)
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--source',
+                        help='/dev/videoN:FMT:WxH:N/D or .mp4 file',
+                        default='/dev/video0:YUY2:1280x720:30/1')
     parser.add_argument('--model',
                         help='.tflite model path.', required=True)
     parser.add_argument('--labels',
@@ -40,21 +44,21 @@
 
     engine = DetectionEngine(args.model)
     labels = load_labels(args.labels) if args.labels else None
-    filtered_labels = set(x.strip() for x in args.filter.split(',')) if args.filter else None
+    filtered_labels = set(l.strip() for l in args.filter.split(',')) if args.filter else None
 
-    _, h, w, _ = engine.get_input_tensor_shape()
+    camera = make_camera(args.source, input_image_size(engine))
+    assert camera is not None
 
-    camera = InferenceCamera((640, 360), (w, h))
     with StreamingServer(camera) as server:
-        def on_image(rgb, inference_fps, size, view_box):
+        def on_image(tensor, inference_fps, size, window):
             start = time.monotonic()
-            objs = engine.DetectWithInputTensor(rgb, threshold=args.threshold, top_k=args.top_k)
+            objs = engine.DetectWithInputTensor(tensor, threshold=args.threshold, top_k=args.top_k)
             inference_time = time.monotonic() - start
 
             if labels and filtered_labels:
                 objs = [obj for obj in objs if labels[obj.label_id] in filtered_labels]
 
-            server.send_overlay(overlays.detection(objs, inference_time, inference_fps, labels, size, view_box))
+            server.send_overlay(overlays.detection(objs, labels, inference_time, inference_fps, size, window))
 
         camera.on_image = on_image
         signal.pause()
diff --git a/edgetpuvision/gst.py b/edgetpuvision/gst.py
index 230eafa..7853e11 100644
--- a/edgetpuvision/gst.py
+++ b/edgetpuvision/gst.py
@@ -1,20 +1,34 @@
 import collections
 import itertools
+import re
 
 __all__ = ('Filter', 'Queue', 'Caps', 'Tee',
-           'Size', 'Fraction',
-           'describe', 'max_inner_size', 'min_outer_size', 'center_inside')
+           'Size', 'Fraction', 'Format',
+           'describe', 'max_inner_size', 'min_outer_size', 'center_inside', 'parse_format')
 
-Fraction = collections.namedtuple('Fraction', ['num', 'den'])
+Fraction = collections.namedtuple('Fraction', ('num', 'den'))
 Fraction.__str__ = lambda self: '%s/%s' % (self.num, self.den)
 
-Size = collections.namedtuple('Size', ['width', 'height'])
+Size = collections.namedtuple('Size', ('width', 'height'))
 Size.__mul__ = lambda self, arg: Size(int(arg * self.width), int(arg * self.height))
 Size.__rmul__ = lambda self, arg: Size(int(arg * self.width), int(arg * self.height))
 Size.__floordiv__ = lambda self, arg: Size(self.width // arg, self.height // arg)
 Size.__truediv__ = lambda self, arg: Size(int(self.width / arg), int(self.height / arg))
 Size.__str__ = lambda self: '%dx%d' % self
 
+Format = collections.namedtuple('Format', ('device', 'pixel', 'size', 'framerate'))
+
+V4L2_DEVICE = re.compile(r'(?P<dev>[^:]+):(?P<fmt>[^:]+):(?P<w>\d+)x(?P<h>\d+):(?P<num>\d+)/(?P<den>\d+)')
+
+def parse_format(src):
+    match = V4L2_DEVICE.search(src)
+    if match:
+        return Format(device=match.group('dev'),
+                      pixel=match.group('fmt'),
+                      size=Size(int(match.group('w')), int(match.group('h'))),
+                      framerate=Fraction(int(match.group('num')), int(match.group('den'))))
+    return None
+
 def max_inner_size(what, where):
     # Example: what=(800, 600) where=(300, 300) => (300, 225)
     return what * min(where.width / what.width, where.height / what.height)
@@ -52,10 +66,10 @@
         return self.params[name]
 
 class Filter(Element):
-    def __init__(self, filtername, pins=None, **params):
+    def __init__(self, filtername, pads=None, **params):
         super().__init__(params)
         self.filtername = filtername
-        self.pins = pins
+        self.pads = pads
 
     def __str__(self):
         return join(self.filtername, ' ', self.params)
@@ -76,9 +90,9 @@
         return join(self.mediatype, ',', self.params, ',')
 
 class Tee(Element):
-    def __init__(self, pins=None, **params):
+    def __init__(self, pads=None, **params):
         super().__init__(params)
-        self.pins = pins
+        self.pads = pads
         self.params = params
 
     def __str__(self):
@@ -93,13 +107,13 @@
     elif isinstance(arg, Tee):
         params = params_with_name(arg.params, 't', name_gens)
         return join('tee', ' ', params) + '\n' + \
-             '\n'.join('%s%s. ! %s' % (indent, params['name'], recur(x)) for x in arg.pins)
+             '\n'.join('%s%s. ! %s' % (indent, params['name'], recur(x)) for x in arg.pads)
     elif isinstance(arg, Filter):
         body = join(arg.filtername, ' ', arg.params)
-        if arg.pins:
+        if arg.pads:
             params = params_with_name(arg.params, 'f', name_gens)
             return body + '\n' + \
-              '\n'.join('%s%s.%s ! %s' % (indent, params['name'], pin_name, recur(x)) for pin_name, x in arg.pins.items())
+              '\n'.join('%s%s.%s ! %s' % (indent, params['name'], pad_name, recur(x)) for pad_name, x in arg.pads.items())
         return body
     elif isinstance(arg, Queue):
         return join('queue', ' ', arg.params)
diff --git a/edgetpuvision/gstreamer.py b/edgetpuvision/gstreamer.py
index 43f561a..bd3cc5e 100644
--- a/edgetpuvision/gstreamer.py
+++ b/edgetpuvision/gstreamer.py
@@ -4,7 +4,6 @@
 import functools
 import os
 import queue
-import re
 import sys
 import termios
 import threading
@@ -30,6 +29,8 @@
 
 from .gst import *
 
+COMMAND_SAVE_FRAME = ' '
+COMMAND_PRINT_INFO = 'p'
 
 def set_nonblocking(fd):
     flags = fcntl.fcntl(fd, fcntl.F_GETFL)
@@ -87,15 +88,13 @@
         Filter('appsink', name='appsink', emit_signals=True, max_buffers=1, drop=True, sync=False)
     )
 
-
-# TODO(dkovalev): Image as an input.
 def image_file_pipeline(filename, render_size, inference_size, fullscreen):
     size = max_inner_size(render_size, inference_size)
     return (
         Filter('filesrc', location=filename),
         Filter('decodebin'),
         Filter('imagefreeze'),
-        Tee(pins=((
+        Tee(pads=((
             Queue(max_size_buffers=1),
             Filter('videoconvert'),
             Filter('videoscale'),
@@ -113,7 +112,6 @@
         )))
     )
 
-
 def video_file_pipeline(filename, render_size, inference_size, fullscreen):
     return (
         Filter('filesrc', location=filename),
@@ -121,31 +119,31 @@
         Filter('h264parse'),
         Filter('vpudec'),
         Filter('glupload'),
-        Tee(pins=((
+        Tee(pads=((
             Queue(max_size_buffers=1),
             Filter('glfilterbin', filter='glcolorscale'),
             Filter('rsvgoverlay', name='overlay'),
             Caps('video/x-raw', width=render_size.width, height=render_size.height),
             sink(fullscreen),
         ),(
-            Queue(max_size_buffers=1),
+            Queue(max_size_buffers=1, leaky='downstream'),
             inference_pipeline(render_size, inference_size),
         )))
     )
 
 # v4l2-ctl --list-formats-ext --device /dev/video1
-def v4l2_camera(device, fmt, size, framerate):
+def v4l2_camera(fmt):
     return (
-        Filter('v4l2src', device=device),
-        Caps('video/x-raw', format=fmt, width=size.width, height=size.height,
-             framerate='%d/%d' % framerate),
+        Filter('v4l2src', device=fmt.device),
+        Caps('video/x-raw', format=fmt.pixel, width=fmt.size.width, height=fmt.size.height,
+             framerate='%d/%d' % fmt.framerate),
     )
 
 def video_camera_pipeline(render_size, inference_size, fullscreen):
     return (
         # TODO(dkovalev): Queue(max_size_buffers=1, leaky='downstream'),
         Filter('glupload'),
-        Tee(pins=((
+        Tee(pads=((
             Queue(max_size_buffers=1, leaky='downstream'),
             Filter('glfilterbin', filter='glcolorscale'),
             Filter('rsvgoverlay', name='overlay'),
@@ -156,9 +154,56 @@
         )))
     )
 
-class Command:
-    SAVE_FRAME = 'save_frame'
-    PRINT_INFO = 'print_info'
+def h264sink(display_decoded=False):
+    appsink = Filter('appsink', name='h264sink', emit_signals=True, max_buffers=1, drop=False, sync=False)
+
+    if display_decoded:
+        return Tee(pads=(
+                   (Queue(), appsink),
+                   (Queue(), Filter('vpudec'), Filter('kmssink', sync=False))
+               ))
+
+    return appsink
+
+def file_streaming_pipeline(filename, render_size, inference_size):
+    return (
+        Filter('filesrc', location=filename),
+        Filter('qtdemux'),
+        Tee(pads=((
+          Queue(max_size_buffers=1),
+          Filter('h264parse'),
+          Filter('vpudec'),
+          inference_pipeline(render_size, inference_size),
+        ), (
+          Queue(max_size_buffers=1),
+          Filter('h264parse'),
+          Caps('video/x-h264', stream_format='byte-stream', alignment='nal'),
+          h264sink()
+        )))
+    )
+
+def camera_streaming_pipeline(profile, bitrate, render_size, inference_size):
+    size = max_inner_size(render_size, inference_size)
+    return (
+        Tee(pads=((
+          Queue(),
+          inference_pipeline(render_size, inference_size)
+        ), (
+          Queue(max_size_buffers=1, leaky='downstream'),
+          Filter('videoconvert'),
+          Filter('x264enc',
+                 speed_preset='ultrafast',
+                 tune='zerolatency',
+                 threads=4,
+                 key_int_max=5,
+                 bitrate=int(bitrate / 1000),  # kbit per second.
+                 aud=False),
+          Caps('video/x-h264', profile=profile),
+          Filter('h264parse'),
+          Caps('video/x-h264', stream_format='byte-stream', alignment='nal'),
+          h264sink()
+        )))
+    )
 
 def save_frame(rgb, size, overlay=None, ext='png'):
     tag = '%010d' % int(time.monotonic() * 1000)
@@ -188,18 +233,15 @@
     return Size(structure.get_value('width'),
                 structure.get_value('height'))
 
-def get_video_size(uri):
+def get_video_info(filename):
     #Command line: gst-discoverer-1.0 -v ~/cars_highway.mp4
+    uri = 'file://' + filename
     discoverer = GstPbutils.Discoverer()
     info = discoverer.discover_uri(uri)
 
-    # TODO(dkovalev): Image as an input.
-    #stream_info = info.get_stream_info()
-    #return Size(stream_info.get_width(), stream_info.get_height())
-
     streams = info.get_video_streams()
     assert len(streams) == 1
-    return caps_size(streams[0].get_caps())
+    return streams[0]
 
 def loop():
     return GLib.MainLoop.new(None, False)
@@ -255,7 +297,7 @@
     pipeline.set_state(Gst.State.PLAYING)
     try:
         loop.run()
-    except KeyboardInterrupt as e:
+    except KeyboardInterrupt:
         pass
     finally:
         pipeline.set_state(Gst.State.NULL)
@@ -263,61 +305,67 @@
 
 def on_keypress(fd, flags, commands):
     for ch in sys.stdin.read():
-        if ch == ' ':
-            commands.put(Command.SAVE_FRAME)
-        elif ch == 'i':
-            commands.put(Command.PRINT_INFO)
+        commands.put(ch)
     return True
 
 def on_new_sample(sink, pipeline, render_overlay, render_size, images, commands, fps_counter):
     with pull_sample(sink) as (sample, data):
-        fps = next(fps_counter)
-        svg = render_overlay(np.frombuffer(data, dtype=np.uint8), inference_fps=fps)
-        if svg:
-            overlay = pipeline.get_by_name('overlay')
-            overlay.set_property('data', svg)
+        inference_rate = next(fps_counter)
+        custom_command = None
+        save_frame = False
 
         command = get_nowait(commands)
-        if command is Command.SAVE_FRAME:
-            images.put((data, caps_size(sample.get_caps()), svg))
-        elif command is Command.PRINT_INFO:
+        if command == COMMAND_SAVE_FRAME:
+            save_frame = True
+        elif command == COMMAND_PRINT_INFO:
             print('Timestamp: %.2f' % time.monotonic())
-            print('Inference FPS: %s' % fps)
+            print('Inference FPS: %s' % inference_rate)
             print('Render size: %d x %d' % render_size)
             print('Inference size: %d x %d' % caps_size(sample.get_caps()))
+        else:
+            custom_command = command
+
+        svg = render_overlay(np.frombuffer(data, dtype=np.uint8),
+                             inference_rate=inference_rate,
+                             command=custom_command)
+        overlay = pipeline.get_by_name('overlay')
+        overlay.set_property('data', svg)
+
+        if save_frame:
+            images.put((data, caps_size(sample.get_caps()), svg))
 
     return Gst.FlowReturn.OK
 
 
-V4L2_DEVICE = re.compile(r'(?P<dev>[^:]+):(?P<fmt>[^:]+):(?P<w>\d+)x(?P<h>\d+):(?P<num>\d+)/(?P<den>\d+)')
-
+def run_gen(render_overlay_gen, *, source, downscale, fullscreen):
+    inference_size = render_overlay_gen.send(None)  # Initialize.
+    return run(inference_size,
+        lambda tensor, size, window, inference_rate, command:
+            render_overlay_gen.send((tensor, size, window, inference_rate, command)),
+        source=source,
+        downscale=downscale,
+        fullscreen=fullscreen)
 
 def run(inference_size, render_overlay, *, source, downscale, fullscreen):
-    match = V4L2_DEVICE.search(source)
-    if match:
-        run_camera(inference_size, render_overlay,
-                   device=match.group('dev'),
-                   fmt=match.group('fmt'),
-                   size=(int(match.group('w')), int(match.group('h'))),
-                   framerate=(int(match.group('num')), int(match.group('den'))),
-                   fullscreen=fullscreen)
+    fmt = parse_format(source)
+    if fmt:
+        run_camera(inference_size, render_overlay, fmt, fullscreen)
         return True
-    else:
-        filename = os.path.expanduser(source)
-        if os.path.isfile(filename):
-            run_file(inference_size, render_overlay,
-                     filename=filename,
-                     downscale=downscale,
-                     fullscreen=fullscreen)
-            return True
+
+    filename = os.path.expanduser(source)
+    if os.path.isfile(filename):
+        run_file(inference_size, render_overlay,
+                 filename=filename,
+                 downscale=downscale,
+                 fullscreen=fullscreen)
+        return True
 
     return False
 
-
-def run_camera(inference_size, render_overlay, *, device, fmt, size, framerate, fullscreen):
+def run_camera(inference_size, render_overlay, fmt, fullscreen):
     inference_size = Size(*inference_size)
 
-    camera = v4l2_camera(device, fmt, Size(*size), framerate)
+    camera = v4l2_camera(fmt)
     caps = next(x for x in camera if isinstance(x, Caps))
     render_size = Size(caps.width, caps.height)
     pipeline = camera + video_camera_pipeline(render_size, inference_size, fullscreen)
@@ -327,9 +375,13 @@
 def run_file(inference_size, render_overlay, *, filename, downscale, fullscreen):
     inference_size = Size(*inference_size)
 
-    video_size = get_video_size('file://' + filename)
-    render_size = video_size / downscale
-    pipeline = video_file_pipeline(filename, render_size, inference_size, fullscreen)
+    info = get_video_info(filename)
+    render_size = Size(info.get_width(), info.get_height()) / downscale
+    if info.is_image():
+        pipeline = image_file_pipeline(filename, render_size, inference_size, fullscreen)
+    else:
+        pipeline = video_file_pipeline(filename, render_size, inference_size, fullscreen)
+
     return run_loop(pipeline, inference_size, render_size, render_overlay)
 
 
@@ -346,13 +398,13 @@
             stack.enter_context(term_raw_mode(sys.stdin.fileno()))
 
         size = min_outer_size(inference_size, render_size)
-        view_box = center_inside(render_size, size)
+        window = center_inside(render_size, size)
 
         run_pipeline(loop, pipeline, {'appsink': {'new-sample':
             functools.partial(on_new_sample,
                 render_overlay=functools.partial(render_overlay,
                     size=size,
-                    view_box=view_box),
+                    window=window),
                 render_size=render_size,
                 images=images,
                 commands=commands,
diff --git a/edgetpuvision/overlays.py b/edgetpuvision/overlays.py
index 6d5b884..3eb9a50 100644
--- a/edgetpuvision/overlays.py
+++ b/edgetpuvision/overlays.py
@@ -8,16 +8,16 @@
 def _normalize_rect(rect, size):
     width, height = size
     x0, y0, x1, y1 = rect
-    x, y, w, h = x0, y0, x1 - x0, y1 - y0
-    return int(x * width), int(y * height), int(w * width), int(h * height)
+    return int(x0 * width), int(y0 * height), \
+           int((x1 - x0) * width), int((y1 - y0) * height)
 
 
-def classification(results, inference_time, inference_fps, size, view_box):
-    x0, y0, _, _ = view_box
+def classification(results, inference_time, inference_rate, size, window):
+    x0, y0, _, _ = window
 
     lines = [
         'Inference time: %.2f ms (%.2f fps)' % (inference_time * 1000, 1.0 / inference_time),
-        'Inference frame rate: %.2f fps' % inference_fps
+        'Inference frame rate: %.2f fps' % inference_rate
     ]
 
     for i, (label, score) in enumerate(results):
@@ -26,31 +26,30 @@
     defs = svg.Defs()
     defs += CSS_STYLES
 
-    doc = svg.Svg(viewBox='%s %s %s %s' % view_box, font_size='26px')
+    doc = svg.Svg(viewBox='%s %s %s %s' % window, font_size='26px')
     doc += defs
     doc += svg.normal_text(lines, x=x0 + 10, y=y0 + 10, font_size_em=1.1)
     return str(doc)
 
 
-def detection(objs, inference_time, inference_fps, labels, size, view_box):
-    x0, y0, _, _ = view_box
+def detection(objs, labels, inference_time, inference_rate, size, window):
+    x0, y0, _, _ = window
 
     defs = svg.Defs()
     defs += CSS_STYLES
 
-    doc = svg.Svg(viewBox='%s %s %s %s' % view_box, font_size='26px')
+    doc = svg.Svg(viewBox='%s %s %s %s' % window, font_size='26px')
     doc += defs
     doc += svg.normal_text((
         'Inference time: %.2f ms (%.2f fps)' % (inference_time * 1000, 1.0 / inference_time),
-        'Inference frame rate: %.2f fps' % inference_fps,
+        'Inference frame rate: %.2f fps' % inference_rate,
         'Objects: %d' % len(objs),
     ), x0 + 10, y0 + 10, font_size_em=1.1)
 
     for obj in objs:
         percent = int(100 * obj.score)
         if labels:
-            label = labels[obj.label_id]
-            caption = '%d%% %s' % (percent, label)
+            caption = '%d%% %s' % (percent, labels[obj.label_id])
         else:
             caption = '%d%%' % percent
 
diff --git a/edgetpuvision/utils.py b/edgetpuvision/utils.py
index 2e2e7c9..9495973 100644
--- a/edgetpuvision/utils.py
+++ b/edgetpuvision/utils.py
@@ -1,7 +1,16 @@
 import re
 
+LABEL_PATTERN = re.compile(r'\s*(\d+)(.+)')
+
 def load_labels(path):
-    p = re.compile(r'\s*(\d+)(.+)')
     with open(path, 'r', encoding='utf-8') as f:
-       lines = (p.match(line).groups() for line in f.readlines())
+       lines = (LABEL_PATTERN.match(line).groups() for line in f.readlines())
        return {int(num): text.strip() for num, text in lines}
+
+
+def input_image_size(engine):
+    _, h, w, _ = engine.get_input_tensor_shape()
+    return w, h
+
+def same_input_image_sizes(engines):
+    return len({input_image_size(engine) for engine in engines}) == 1
diff --git a/setup.py b/setup.py
index 997fba2..eaa0313 100644
--- a/setup.py
+++ b/setup.py
@@ -18,6 +18,7 @@
         'protobuf>=3.0.0',
         'edgetpu',
     ],
+    scripts=['bin/edgetpu_demo'],
     entry_points = {
         'console_scripts': ['edgetpu_classify=edgetpuvision.classify:main',
                             'edgetpu_classify_server=edgetpuvision.classify_server:main',