1 files changed, 521 insertions, 0 deletions
diff --git a/drivers/net/can/spi/mcp25xxfd/mcp25xxfd_can_rx.c b/drivers/net/can/spi/mcp25xxfd/mcp25xxfd_can_rx.c
new file mode 100644
index 000000000000..a9e0da434e29
--- /dev/null
+++ b/drivers/net/can/spi/mcp25xxfd/mcp25xxfd_can_rx.c
@@ -0,0 +1,521 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* CAN bus driver for Microchip 25XXFD CAN Controller with SPI Interface
+ *
+ * Copyright 2019 Martin Sperl <kernel@martin.sperl.org>
+ *
+ * Based on Microchip MCP251x CAN controller driver written by
+ * David Vrabel, Copyright 2006 Arcom Control Systems Ltd.
+ */
+
+#include <linux/can/core.h>
+#include <linux/can/dev.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spi/spi.h>
+
+#include "mcp25xxfd_cmd.h"
+#include "mcp25xxfd_can.h"
+#include "mcp25xxfd_can_debugfs.h"
+#include "mcp25xxfd_can_id.h"
+#include "mcp25xxfd_can_priv.h"
+#include "mcp25xxfd_can_rx.h"
+
+/* module parameters: -1 (the default) selects history-based prefetch sizes */
+static unsigned int rx_prefetch_bytes = -1;
+module_param(rx_prefetch_bytes, uint, 0664);
+MODULE_PARM_DESC(rx_prefetch_bytes,
+		 "number of bytes to blindly prefetch when reading a rx-fifo");
+
+/* allocate a classic CAN skb, fill in id/dlc and expose its payload buffer */
+static struct sk_buff *
+mcp25xxfd_can_rx_submit_normal_frame(struct mcp25xxfd_can_priv *cpriv,
+				     u32 id, u32 dlc, u8 **data)
+{
+	struct can_frame *cf;
+	struct sk_buff *skb = alloc_can_skb(cpriv->can.dev, &cf);
+
+	/* without an skb there is nothing to fill in; *data is untouched */
+	if (skb) {
+		/* header fields as handed in by the caller */
+		cf->can_id = id;
+		cf->can_dlc = dlc;
+
+		/* the caller copies the payload through this pointer */
+		*data = cf->data;
+	}
+
+	/* NULL when alloc_can_skb() failed */
+	return skb;
+}
+
+/* like mcp25xxfd_can_rx_submit_normal_frame() but for CAN FD frames */
+static struct sk_buff *
+mcp25xxfd_can_rx_submit_fd_frame(struct mcp25xxfd_can_priv *cpriv,
+				 u32 id, u32 flags, u32 len, u8 **data)
+{
+	struct canfd_frame *cfd;
+	struct sk_buff *skb = alloc_canfd_skb(cpriv->can.dev, &cfd);
+
+	/* without an skb there is nothing to fill in; *data is untouched */
+	if (skb) {
+		/* id and payload length are stored as handed in */
+		cfd->can_id = id;
+		cfd->len = len;
+
+		/* caller flags (BRS/ESI) are or-ed into the frame flags */
+		cfd->flags |= flags;
+
+		/* the caller copies the payload through this pointer */
+		*data = cfd->data;
+	}
+
+	return skb;
+}
+
+/* deliver the rx object of @fifo as an skb; returns 0 or -ENOMEM */
+int mcp25xxfd_can_rx_submit_frame(struct mcp25xxfd_can_priv *cpriv, int fifo)
+{
+	struct net_device *net = cpriv->can.dev;
+	int addr = cpriv->fifos.info[fifo].offset;
+	struct mcp25xxfd_can_obj_rx *rx =
+		(struct mcp25xxfd_can_obj_rx *)(cpriv->sram + addr);
+	bool fdf = rx->flags & MCP25XXFD_CAN_OBJ_FLAGS_FDF;
+	u32 brs = (rx->flags & MCP25XXFD_CAN_OBJ_FLAGS_BRS) ? CANFD_BRS : 0;
+	u8 *data = NULL;
+	struct sk_buff *skb;
+	u32 id, dlc, len, flags;
+
+	/* compute the can_id, the dlc and the payload length */
+	mcp25xxfd_can_id_from_mcp25xxfd(rx->id, rx->flags, &id);
+	dlc = (rx->flags & MCP25XXFD_CAN_OBJ_FLAGS_DLC_MASK) >>
+		MCP25XXFD_CAN_OBJ_FLAGS_DLC_SHIFT;
+	len = can_dlc2len(dlc);
+
+	/* classic CAN: DLC > 8 still means 8 bytes, so never exceed data[] */
+	if (!fdf)
+		len = min_t(u32, len, CAN_MAX_DLEN);
+
+	/* update stats */
+	net->stats.rx_packets++;
+	net->stats.rx_bytes += len;
+	MCP25XXFD_DEBUGFS_INCR(cpriv->fifos.rx.dlc_usage[dlc]);
+	if (fdf)
+		MCP25XXFD_DEBUGFS_INCR(cpriv->fifos.rx.fd_count);
+
+	/* add to rx_history (raw dlc, as used by the prefetch prediction) */
+	cpriv->rx_history.dlc[cpriv->rx_history.index] = dlc;
+	cpriv->rx_history.brs[cpriv->rx_history.index] = brs;
+	cpriv->rx_history.index++;
+	if (cpriv->rx_history.index >= MCP25XXFD_CAN_RX_DLC_HISTORY_SIZE)
+		cpriv->rx_history.index = 0;
+
+	/* allocate the skb buffer matching the frame type */
+	if (fdf) {
+		flags = brs;
+		if (rx->flags & MCP25XXFD_CAN_OBJ_FLAGS_ESI)
+			flags |= CANFD_ESI;
+		skb = mcp25xxfd_can_rx_submit_fd_frame(cpriv, id, flags,
+						       len, &data);
+	} else {
+		skb = mcp25xxfd_can_rx_submit_normal_frame(cpriv, id,
+							   len, &data);
+	}
+	if (!skb) {
+		netdev_err(net, "cannot allocate RX skb\n");
+		net->stats.rx_dropped++;
+		return -ENOMEM;
+	}
+
+	/* copy the payload data and submit the frame */
+	memcpy(data, rx->data, len);
+	netif_rx_ni(skb);
+
+	return 0;
+}
+
+/* fetch (if @read) the rx object of @fifo, fix it up and queue it */
+static int mcp25xxfd_can_rx_read_frame(struct mcp25xxfd_can_priv *cpriv,
+				       int fifo, int prefetch_bytes, bool read)
+{
+	struct net_device *net = cpriv->can.dev;
+	struct spi_device *spi = cpriv->priv->spi;
+	int addr = cpriv->fifos.info[fifo].offset;
+	struct mcp25xxfd_can_obj_rx *rx =
+		(struct mcp25xxfd_can_obj_rx *)(cpriv->sram + addr);
+	int dlc, len, missing, ret;
+
+	/* read header plus prefetch_bytes; !read: the caller already did */
+	if (read) {
+		MCP25XXFD_DEBUGFS_INCR(cpriv->stats.rx_single_reads);
+		ret = mcp25xxfd_cmd_readn(spi, MCP25XXFD_SRAM_ADDR(addr),
+					  rx, sizeof(*rx) + prefetch_bytes);
+		if (ret)
+			return ret;
+	}
+
+	/* the header words are little endian - convert them in place */
+	rx->id = le32_to_cpu(*(__le32 *)&rx->id);
+	rx->flags = le32_to_cpu(*(__le32 *)&rx->flags);
+	rx->ts = le32_to_cpu(*(__le32 *)&rx->ts);
+
+	/* payload length, with the dlc capped to what the mtu allows */
+	dlc = (rx->flags & MCP25XXFD_CAN_OBJ_FLAGS_DLC_MASK) >>
+		MCP25XXFD_CAN_OBJ_FLAGS_DLC_SHIFT;
+	len = can_dlc2len(min_t(int, dlc, (net->mtu == CANFD_MTU) ? 15 : 8));
+	missing = len - prefetch_bytes;
+
+	/* the prefetch was too short: fetch the tail of the payload */
+	if (read && missing > 0) {
+		MCP25XXFD_DEBUGFS_STATS_INCR(cpriv,
+					     rx_reads_prefetched_too_few);
+		MCP25XXFD_DEBUGFS_STATS_ADD(cpriv,
+					    rx_reads_prefetched_too_few_bytes,
+					    missing);
+		ret = mcp25xxfd_cmd_readn(spi,
+					  MCP25XXFD_SRAM_ADDR(addr) +
+					  sizeof(*rx) + prefetch_bytes,
+					  &rx->data[prefetch_bytes],
+					  missing);
+		if (ret)
+			return ret;
+	}
+
+	/* update stats - NOTE(review): the byte total is added to the same
+	 * counter that is incremented (compare the too_few/_bytes pair);
+	 * confirm whether a separate *_too_many_bytes counter was intended
+	 */
+	MCP25XXFD_DEBUGFS_INCR(cpriv->stats.rx_reads);
+	if (missing < 0) {
+		MCP25XXFD_DEBUGFS_STATS_INCR(cpriv,
+					     rx_reads_prefetched_too_many);
+		MCP25XXFD_DEBUGFS_STATS_ADD(cpriv,
+					    rx_reads_prefetched_too_many,
+					    -missing);
+	}
+
+	/* clear the rest of the buffer - just to be safe */
+	memset(rx->data + len, 0, ((net->mtu == CANFD_MTU) ? 64 : 8) - len);
+
+	/* count the use of this fifo and add it to the process queues */
+	MCP25XXFD_DEBUGFS_INCR(cpriv->fifos.info[fifo].use_count);
+	mcp25xxfd_can_queue_frame(cpriv, fifo, rx->ts, true);
+
+	/* and clear the interrupt flag for that fifo */
+	return mcp25xxfd_cmd_write_mask(spi, MCP25XXFD_CAN_FIFOCON(fifo),
+					MCP25XXFD_CAN_FIFOCON_FRESET,
+					MCP25XXFD_CAN_FIFOCON_FRESET);
+}
+
+/* read the rx objects of fifos fstart..fend with a single spi transfer;
+ * the range may be given in either order and is processed fstart first
+ */
+static int mcp25xxfd_can_read_rx_frame_bulk(struct mcp25xxfd_can_priv *cpriv,
+					    int fstart, int fend)
+{
+	struct net_device *net = cpriv->can.dev;
+	int count = abs(fend - fstart) + 1;
+	int step = (fend < fstart) ? -1 : 1;
+	int flowest = min_t(int, fstart, fend);
+	int addr = cpriv->fifos.info[flowest].offset;
+	struct mcp25xxfd_can_obj_rx *rx =
+		(struct mcp25xxfd_can_obj_rx *)(cpriv->sram + addr);
+	int len = (sizeof(*rx) + ((net->mtu == CANFD_MTU) ? 64 : 8)) * count;
+	int fifo, i, ret;
+
+	/* update stats - the last bin collects all larger bulk sizes */
+	MCP25XXFD_DEBUGFS_STATS_INCR(cpriv, rx_bulk_reads);
+	i = min_t(int, MCP25XXFD_CAN_RX_BULK_READ_BINS - 1, count - 1);
+	MCP25XXFD_DEBUGFS_STATS_INCR(cpriv, rx_bulk_read_sizes[i]);
+
+	/* one transfer covering header and full payload area of each fifo */
+	ret = mcp25xxfd_cmd_readn(cpriv->priv->spi, MCP25XXFD_SRAM_ADDR(addr),
+				  rx, len);
+	if (ret)
+		return ret;
+
+	/* walk from fstart towards fend without reading again; ret is 0 here */
+	for (fifo = fstart; !ret && count > 0; fifo += step, count--)
+		ret = mcp25xxfd_can_rx_read_frame(cpriv, fifo, 8, false);
+
+	return ret;
+}
+
+/* predict the payload bytes to prefetch with the next rx header: the
+ * rx_prefetch_bytes parameter if set, else the most frequent recent length
+ */
+static int mcp25xxfd_can_rx_predict_prefetch(struct mcp25xxfd_can_priv *cpriv)
+{
+	bool is_fd = cpriv->can.dev->mtu == CANFD_MTU;
+	unsigned int fixed = rx_prefetch_bytes;
+	unsigned int histo[16] = { 0 };
+	unsigned int top = 0;
+	int dlc = 8, i;
+
+	/* a configured value wins; clamp it unsigned so that values above
+	 * INT_MAX are capped instead of becoming a negative length
+	 */
+	if (fixed != UINT_MAX)
+		return min_t(unsigned int, fixed, is_fd ? 64 : 8);
+
+	/* for all others compute the histogram */
+	for (i = 0; i < MCP25XXFD_CAN_RX_DLC_HISTORY_SIZE; i++)
+		histo[cpriv->rx_history.dlc[i]]++;
+
+	/* pick the most frequent dlc - on a tie the higher dlc wins */
+	for (i = is_fd ? 15 : 8; i >= 0; i--) {
+		if (top < histo[i]) {
+			top = histo[i];
+			dlc = i;
+		}
+	}
+
+	/* remember and return the predicted length */
+	cpriv->rx_history.predicted_len = can_dlc2len(dlc);
+	return cpriv->rx_history.predicted_len;
+}
+
+/* at least in can2.0 mode we can read multiple RX-fifos in one go
+ * in case they are adjacent to each other and thus we can reduce
+ * the number of spi messages produced and this improves spi-bus
+ * usage efficiency.
+ * In canFD mode this may also be possible, but would need some
+ * statistics to decide if it is worth reading a full 64 bytes
+ * in one go.
+ * But those statistics can get used to predict how many bytes
+ * to read together with the can header (which is fixed to 8 at
+ * this very moment).
+ *
+ * notes on the rationale here:
+ * * Reading just the CAN header info takes:
+ *   * bytes read
+ *     *  2 bytes command+address
+ *     * 12 bytes data (id, flags, timestamp)
+ *   * so that is at the very least 112 SCK (= 14 byte * 8 SCK/1 byte)
+ *     - on a Raspberry pi 3 for such short requests actually
+ *       126 SCK (=14 byte * 9 SCK/1 byte)
+ *   * some SPI framework overhead which is observed to be 5-10 us
+ *     on a raspberry pi 3 (time between SCK and stop SCK start)
+ *   * with an effective 17.85 MHz SPI clock on a RPI it takes in total:
+ *     it takes 12us = 6us + 6us
+ * * now reading 8 bytes of CAN data (can2.0) takes:
+ *   * bytes read
+ *     *  2 bytes command+address
+ *     *  8 bytes data
+ *   * so that is at the very least 80 SCK (= 10 byte * 8 SCK/1 byte)
+ *     - on a Raspberry pi 3 for such short requests actually
+ *       90 SCK (= 10 byte * 9 SCK/1 byte)
+ *   * some SPI framework overhead which is observed to be 5-10 us
+ *     on a raspberry pi 3 (time between SCK and stop SCK start)
+ *   * with an effective 17.85 MHz SPI clock on a RPI it takes in total:
+ *     it takes 11us = 5.0us + 6us
+ * * now reading CAN header plus 8 bytes of CAN data (can2.0) takes:
+ *   * bytes read
+ *     *  2 bytes command+address
+ *     * 20 bytes data
+ *   * so that is at the very least 176 SCK (= 22 byte * 8 SCK/1 byte)
+ *     - on a Raspberry pi 3 for such short requests actually
+ *       198 SCK (= 22 byte * 9 SCK/1 byte)
+ *   * some SPI framework overhead which is observed to be 5-10 us
+ *     on a raspberry pi 3 (time between SCK and stop SCK start)
+ *   * with an effective 17.85 MHz SPI clock on a RPI it takes in total:
+ *     it takes 17.1us = 11.1us + 6us
+ *   * this is faster than the 2 individual SPI transfers for header
+ *     and data which is in total 23us
+ *     * this is even true for the case where we only have a single
+ *       data byte (DLC=1) - the time here is 19.5us on a RPI3
+ *     * the only time where we are less efficient is for the DLC=0 case.
+ *       but the assumption here is that this is a rare case
+ * To put it into perspective here the full table for a RPI3:
+ * LE 2m  pr0 pr1 pr2 pr3 pr4 pr5  pr6  pr7  pr8 pr12 pr16 pr20 pr24 pr32 pr48
+ *                                                                         pr64
+ *  0  7.1 7.1
+ *  1 14.6    7.6 8.1 8.6 9.1 9.6 10.1 10.6 11.1 13.1
+ *  2 15.1        8.1 8.6 9.1 9.6 10.1 10.6 11.1 13.1
+ *  3 15.6            8.6 9.1 9.6 10.1 10.6 11.1 13.1 15.1
+ *  4 16.1                9.1 9.6 10.1 10.6 11.1 13.1 15.1
+ *  5 16.6                    9.6 10.1 10.6 11.1 13.1 15.1
+ *  6 17.1                        10.1 10.6 11.1 13.1 15.1
+ *  7 17.6                             10.6 11.1 13.1 15.1 17.1
+ *  8 18.1                                  11.1 13.1 15.1 17.1
+ * 12 20.1                                       13.1 15.1 17.1 19.2
+ * 16 22.1                                            15.1 17.1 19.2
+ * 20 24.1                                                 17.1 19.2 23.2
+ * 24 26.2                                                      19.2 23.2
+ * 32 30.2                                                           23.2
+ * 48 38.3                                                                31.3
+ * 64 46.3                                                                 39.3
+ * (Parameters: SPI Clock=17.8MHz, SCK/byte=9, overhead=6us)
+ * Legend:
+ *   LE = length,
+ *   2m    = 2 SPI messages (header+data - except for LEN=0, only header)
+ *  prX/pX = prefetch length times (only shown when < 2m and Len >= Prefetch)
+ *
+ * The diagonal shows the "optimal" time when the size of the Can frame would
+ * be known ahead of time - i.e if it would be possible to define RX reception
+ * filters based on can DLC values
+ *
+ * So for any Can frame except for LEN=0 the prefetch data solution is
+ * better for prefetch of data=12 for CanFD.
+ *
+ * Here another table showing the optimal prefetch limits for SPI speeds
+ * vs overhead_us at 8 or 9 SCLK/byte
+ *
+ * MHZ  2us@8   2us@9   4us@8   4us@9   6us@8   6us@9   8us@8   8us@9
+ * 10.0 8b***   8b***   8b      8b*     12b**   8b*     12b     12b*
+ * 12.5 8b**    8b***   12b***  8b      12b     12b*    16b*    16b**
+ * 15.0 8b**    8b**    12b**   12b***  16b**   12b     20b**   16b
+ * 17.5 8b*     8b*     12b*    12b**   16b     16b**   20b     20b**
+ * 20.0 8b      8b*     16b***  12b*    20b**   16b     24b*    20b
+ * (a * signifies not a full match, but for any length > count(*))
+ *
+ * So 8 bytes prefetch seems to be a very good tradeoff for can frame
+ * except for DLC/LEN=0 frames.
+ * The question here is mainly: how many frames do we have with DLC=0
+ * vs all others.
+ *
+ * With some statistics of recent CAN frames this may be set dynamically
+ * in the future.
+ *
+ * For this to work efficiently we would also need an estimate on
+ * the SPI framework overhead, which is a function of the spi-bus-driver
+ * implementation details, CPU type and speed as well as system load.
+ * Also the effective SPI-clock speed is needed as well as the
+ * number of spi clock cycles it takes for a single byte to get transferred
+ * The bcm283x SOC for example pauses the SPI clock one cycle after
+ * every byte it sends unless the data is fed to the controller by DMA.
+ * (but for short transfers DMA mapping is very expensive and not worth
+ * the effort. PIO and - in some situations - polling is used instead to
+ * reduce the number of interrupts and the need for thread scheduling as
+ * much as possible)
+ *
+ * This also means that for can2.0 only configured interfaces
+ * reading multiple rx fifos is a realistic option of optimization
+ */
+
+/* read every rx fifo flagged in status.rxif with its own spi transfer(s) */
+static int mcp25xxfd_can_rx_read_single_frames(struct mcp25xxfd_can_priv *cpriv,
+					       int prefetch)
+{
+	int fifo = cpriv->fifos.rx.start;
+	int n, err;
+
+	/* walk the rx fifos in ascending order, stop at the first error */
+	for (n = 0; n < cpriv->fifos.rx.count; n++, fifo++) {
+		/* nothing pending in this fifo */
+		if (!(cpriv->status.rxif & BIT(fifo)))
+			continue;
+		err = mcp25xxfd_can_rx_read_frame(cpriv, fifo, prefetch, true);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/* read pending rx fifos, one bulk transfer per run of adjacent fifos */
+static int mcp25xxfd_can_rx_read_bulk_frames(struct mcp25xxfd_can_priv *cpriv)
+{
+	int last = cpriv->fifos.rx.start + cpriv->fifos.rx.count - 1;
+	int start, end, ret;
+
+	/* scan by fifo index so that a run never leads beyond the rx fifos */
+	for (start = cpriv->fifos.rx.start; start <= last; start = end + 1) {
+		end = start;
+		if (!(cpriv->status.rxif & BIT(start)))
+			continue;
+		/* extend the run while the next rx fifo is pending as well */
+		while (end < last && (cpriv->status.rxif & BIT(end + 1)))
+			end++;
+		/* and now read those fifos in bulk */
+		ret = mcp25xxfd_can_read_rx_frame_bulk(cpriv, start, end);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/* CAN FD: choose between single reads (with prefetch) and bulk reads */
+static int mcp25xxfd_can_rx_read_fd_frames(struct mcp25xxfd_can_priv *cpriv)
+{
+	int prefetch = mcp25xxfd_can_rx_predict_prefetch(cpriv);
+	int full = 0, full_brs = 0;
+	int n;
+
+	/* bulk is only considered when the prediction is a full 64 bytes */
+	if (prefetch >= 64) {
+		/* of the DLC=15 frames in the history, how many used brs? */
+		for (n = 0; n < MCP25XXFD_CAN_RX_DLC_HISTORY_SIZE; n++) {
+			if (cpriv->rx_history.dlc[n] != 15)
+				continue;
+			full++;
+			if (cpriv->rx_history.brs[n])
+				full_brs++;
+		}
+
+		/* if we have at least 33% brs frames then run bulk */
+		if (full_brs * 3 >= full)
+			return mcp25xxfd_can_rx_read_bulk_frames(cpriv);
+	}
+
+	/* everything else is read fifo by fifo */
+	return mcp25xxfd_can_rx_read_single_frames(cpriv, prefetch);
+}
+
+/* dispatch on the mtu: CAN FD has its own strategy, all else reads bulk */
+static int mcp25xxfd_can_rx_read_frames(struct mcp25xxfd_can_priv *cpriv)
+{
+	if (cpriv->can.dev->mtu != CANFD_MTU)
+		return mcp25xxfd_can_rx_read_bulk_frames(cpriv);
+	return mcp25xxfd_can_rx_read_fd_frames(cpriv);
+}
+
+/* rx interrupt: count it and read the flagged fifos; no-op without rxif */
+int mcp25xxfd_can_rx_handle_int_rxif(struct mcp25xxfd_can_priv *cpriv)
+{
+	/* nothing pending - nothing to count or read */
+	if (cpriv->status.rxif == 0)
+		return 0;
+	MCP25XXFD_DEBUGFS_STATS_INCR(cpriv, int_rx_count);
+
+	return mcp25xxfd_can_rx_read_frames(cpriv);
+}
+
+/* rx overflow interrupt: clear and account every fifo flagged in rxovif */
+int mcp25xxfd_can_rx_handle_int_rxovif(struct mcp25xxfd_can_priv *cpriv)
+{
+	struct net_device *net = cpriv->can.dev;
+	int fifo, err;
+
+	if (!cpriv->status.rxovif)
+		return 0;
+
+	MCP25XXFD_DEBUGFS_STATS_INCR(cpriv, int_rxov_count);
+
+	for (fifo = 0; fifo < 32; fifo++) {
+		if (!(cpriv->status.rxovif & BIT(fifo)))
+			continue;
+
+		/* clear the overflow bit in the status of this fifo */
+		err = mcp25xxfd_cmd_write_mask(cpriv->priv->spi,
+					       MCP25XXFD_CAN_FIFOSTA(fifo), 0,
+					       MCP25XXFD_CAN_FIFOSTA_RXOVIF);
+		if (err)
+			return err;
+
+		/* update statistics */
+		net->stats.rx_over_errors++;
+		net->stats.rx_errors++;
+
+		/* and prepare ERROR FRAME */
+		cpriv->error_frame.id |= CAN_ERR_CRTL;
+		cpriv->error_frame.data[1] |= CAN_ERR_CRTL_RX_OVERFLOW;
+	}
+
+	return 0;
+}