From 6f990a017e03a781906347828b13913a0b2bcece Mon Sep 17 00:00:00 2001 From: Sebastien Bourdeauducq Date: Sat, 16 Nov 2013 13:53:26 +0100 Subject: [PATCH] dvisampler: pack pixels in pixel clock domain to improve performance --- misoclib/dvisampler/__init__.py | 6 ++-- misoclib/dvisampler/analysis.py | 52 ++++++++++++++++++---------- misoclib/dvisampler/common.py | 1 - misoclib/dvisampler/dma.py | 60 ++++++++++++--------------------- 4 files changed, 58 insertions(+), 61 deletions(-) diff --git a/misoclib/dvisampler/__init__.py b/misoclib/dvisampler/__init__.py index fb1bbb88..612e9141 100644 --- a/misoclib/dvisampler/__init__.py +++ b/misoclib/dvisampler/__init__.py @@ -12,7 +12,7 @@ from misoclib.dvisampler.analysis import SyncPolarity, ResolutionDetection, Fram from misoclib.dvisampler.dma import DMA class DVISampler(Module, AutoCSR): - def __init__(self, pads, asmiport, n_dma_slots=2): + def __init__(self, pads, lasmim, n_dma_slots=2): self.submodules.edid = EDID(pads) self.submodules.clocking = Clocking(pads) @@ -62,7 +62,7 @@ class DVISampler(Module, AutoCSR): self.resdetection.vsync.eq(self.syncpol.vsync) ] - self.submodules.frame = FrameExtraction() + self.submodules.frame = FrameExtraction(24*lasmim.dw//32) self.comb += [ self.frame.valid_i.eq(self.syncpol.valid_o), self.frame.de.eq(self.syncpol.de), @@ -72,7 +72,7 @@ class DVISampler(Module, AutoCSR): self.frame.b.eq(self.syncpol.b) ] - self.submodules.dma = DMA(asmiport, n_dma_slots) + self.submodules.dma = DMA(lasmim, n_dma_slots) self.comb += self.frame.frame.connect(self.dma.frame) self.ev = self.dma.ev diff --git a/misoclib/dvisampler/analysis.py b/misoclib/dvisampler/analysis.py index 17579f70..5108f8d8 100644 --- a/misoclib/dvisampler/analysis.py +++ b/misoclib/dvisampler/analysis.py @@ -5,7 +5,7 @@ from migen.genlib.record import Record from migen.bank.description import * from migen.flow.actor import * -from misoclib.dvisampler.common import channel_layout, frame_layout +from misoclib.dvisampler.common import channel_layout class SyncPolarity(Module): def __init__(self): @@ -106,7 +106,7 @@ class ResolutionDetection(Module, AutoCSR): self.specials += MultiReg(vcounter_st, self._vres.status) class FrameExtraction(Module, AutoCSR): - def __init__(self): + def __init__(self, word_width): # in pix clock domain self.valid_i = Signal() self.vsync = Signal() @@ -116,39 +116,55 @@ class FrameExtraction(Module, AutoCSR): self.b = Signal(8) # in sys clock domain - self.frame = Source(frame_layout) + word_layout = [("parity", 1), ("pixels", word_width)] + self.frame = Source(word_layout) self.busy = Signal() self._r_overflow = CSR() ### - fifo_stb = Signal() - fifo_in = Record(frame_layout) - self.comb += [ - fifo_stb.eq(self.valid_i & self.de), - fifo_in.r.eq(self.r), - fifo_in.g.eq(self.g), - fifo_in.b.eq(self.b), - ] + # start of frame detection vsync_r = Signal() + new_frame = Signal() + self.comb += new_frame.eq(self.vsync & ~vsync_r) + self.sync.pix += vsync_r.eq(self.vsync) + + # pack pixels into words + cur_word = Signal(word_width) + cur_word_valid = Signal() + encoded_pixel = Signal(24) + self.comb += encoded_pixel.eq(Cat(self.b, self.g, self.r)) + pack_factor = word_width//24 + assert(pack_factor & (pack_factor - 1) == 0) # only support powers of 2 + pack_counter = Signal(max=pack_factor) self.sync.pix += [ - If(self.vsync & ~vsync_r, fifo_in.parity.eq(~fifo_in.parity)), - vsync_r.eq(self.vsync) + cur_word_valid.eq(0), + If(new_frame, + pack_counter.eq(0) + ).Elif(self.valid_i & self.de, + [If(pack_counter == (pack_factor-i-1), + cur_word[24*i:24*(i+1)].eq(encoded_pixel)) for i in range(pack_factor)], + Cat(pack_counter, cur_word_valid).eq(pack_counter + 1) + ) ] - fifo = RenameClockDomains(AsyncFIFO(layout_len(frame_layout), 512), + # FIFO + fifo = RenameClockDomains(AsyncFIFO(word_layout, 512), {"write": "pix", "read": "sys"}) self.submodules += fifo self.comb += [ - fifo.we.eq(fifo_stb), - fifo.din.eq(fifo_in.raw_bits()), + fifo.din.pixels.eq(cur_word), + fifo.we.eq(cur_word_valid) + ] + self.sync.pix += If(new_frame, fifo.din.parity.eq(~fifo.din.parity)) + self.comb += [ self.frame.stb.eq(fifo.readable), - self.frame.payload.raw_bits().eq(fifo.dout), + self.frame.payload.eq(fifo.dout), fifo.re.eq(self.frame.ack), self.busy.eq(0) ] - + # overflow detection pix_overflow = Signal() pix_overflow_reset = Signal() diff --git a/misoclib/dvisampler/common.py b/misoclib/dvisampler/common.py index f053237f..7fb9a420 100644 --- a/misoclib/dvisampler/common.py +++ b/misoclib/dvisampler/common.py @@ -1,3 +1,2 @@ control_tokens = [0b1101010100, 0b0010101011, 0b0101010100, 0b1010101011] channel_layout = [("d", 8), ("c", 2), ("de", 1)] -frame_layout = [("parity", 1), ("r", 8), ("g", 8), ("b", 8)] diff --git a/misoclib/dvisampler/dma.py b/misoclib/dvisampler/dma.py index 2f7565db..412b718f 100644 --- a/misoclib/dvisampler/dma.py +++ b/misoclib/dvisampler/dma.py @@ -5,8 +5,6 @@ from migen.bank.eventmanager import * from migen.flow.actor import * from migen.actorlib import dma_lasmi -from misoclib.dvisampler.common import frame_layout - # Slot status: EMPTY=0 LOADED=1 PENDING=2 class _Slot(Module, AutoCSR): def __init__(self, addr_bits, alignment_bits): @@ -65,7 +63,8 @@ class DMA(Module): bus_dw = lasmim.dw alignment_bits = bits_for(bus_dw//8) - 1 - self.frame = Sink(frame_layout) + fifo_word_width = 24*bus_dw//32 + self.frame = Sink([("parity", 1), ("pixels", fifo_word_width)]) self._r_frame_size = CSRStorage(bus_aw + alignment_bits, alignment_bits=alignment_bits) self.submodules._slot_array = _SlotArray(nslots, bus_aw, alignment_bits) self.ev = self._slot_array.ev @@ -98,32 +97,23 @@ class DMA(Module): ) ] - # pack pixels into memory words - write_pixel = Signal() - last_pixel = Signal() - cur_memory_word = Signal(bus_dw) - encoded_pixel = Signal(32) - self.comb += [ - encoded_pixel.eq(Cat( - self.frame.payload.b[6:], self.frame.payload.b, - self.frame.payload.g[6:], self.frame.payload.g, - self.frame.payload.r[6:], self.frame.payload.r)) - ] - pack_factor = bus_dw//32 - assert(pack_factor & (pack_factor - 1) == 0) # only support powers of 2 - pack_counter = Signal(max=pack_factor) - self.comb += last_pixel.eq(pack_counter == (pack_factor - 1)) - self.sync += If(write_pixel, - [If(pack_counter == (pack_factor-i-1), - cur_memory_word[32*i:32*(i+1)].eq(encoded_pixel)) for i in range(pack_factor)], - pack_counter.eq(pack_counter + 1) - ) + # 24bpp -> 32bpp + memory_word = Signal(bus_dw) + pixbits = [] + for i in range(bus_dw//32): + for j in range(3): + b = (i*3+j)*8 + pixbits.append(self.frame.payload.pixels[b+6:b+8]) + pixbits.append(self.frame.payload.pixels[b:b+8]) + pixbits.append(0) + pixbits.append(0) + self.comb += memory_word.eq(Cat(*pixbits)) # bus accessor self.submodules._bus_accessor = dma_lasmi.Writer(lasmim) self.comb += [ self._bus_accessor.address_data.payload.a.eq(current_address), - self._bus_accessor.address_data.payload.d.eq(cur_memory_word) + self._bus_accessor.address_data.payload.d.eq(memory_word) ] # control FSM @@ -133,23 +123,15 @@ class DMA(Module): fsm.act("WAIT_SOF", reset_words.eq(1), self.frame.ack.eq(~self._slot_array.address_valid | ~sof), - If(self._slot_array.address_valid & sof & self.frame.stb, NextState("TRANSFER_PIXEL")) + If(self._slot_array.address_valid & sof & self.frame.stb, NextState("TRANSFER_PIXELS")) ) - fsm.act("TRANSFER_PIXEL", - self.frame.ack.eq(1), + fsm.act("TRANSFER_PIXELS", + self.frame.ack.eq(self._bus_accessor.address_data.ack), If(self.frame.stb, - write_pixel.eq(1), - If(last_pixel, NextState("TO_MEMORY")) - ) - ) - fsm.act("TO_MEMORY", - self._bus_accessor.address_data.stb.eq(1), - If(self._bus_accessor.address_data.ack, - count_word.eq(1), - If(last_word, - NextState("EOF") - ).Else( - NextState("TRANSFER_PIXEL") + self._bus_accessor.address_data.stb.eq(1), + If(self._bus_accessor.address_data.ack, + count_word.eq(1), + If(last_word, NextState("EOF")) ) ) ) -- 2.30.2