From 2ded307f9e05f0c943e657d5d84d3f20cd54f52a Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel@numascale.com>
Date: Wed, 23 Sep 2015 14:44:10 +0800
Subject: [PATCH] Numachip: Implement scalable soft clock

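Add a clocksource for NumaConnect systems which synthesizes a global
monotonic clock from the shared 32-bit northbridge timer on the master
node. Each node tracks the drift of its local TSC against that timer,
so reads are normally satisfied from the local TSC and only
periodically resynchronised; frequency conversion between northbridge
and core cycles uses 64-bit fixed-point ratios with 40 fractional bits,
and timer wraps are accumulated in a global wrap counter.

Also raise RCU_STALL_RAT_DELAY from 2 to 40 jiffies, giving CPUs on
large fabrics more time to take a scheduling-clock interrupt before
being reported in stall warnings.
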
Signed-off-by: Daniel J Blueman <daniel@numascale.com>
---
 drivers/clocksource/Makefile    |   2 +-
 drivers/clocksource/clocktree.c | 273 ++++++++++++++++++++++++++++++++++++++++
 kernel/rcu/tree.h               |   2 +-
 3 files changed, 275 insertions(+), 2 deletions(-)
 create mode 100644 drivers/clocksource/clocktree.c

diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index 57dfad3..1fed4e3 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -62,4 +62,4 @@ obj-$(CONFIG_H8300)			+= h8300_timer8.o
 obj-$(CONFIG_H8300_TMR16)		+= h8300_timer16.o
 obj-$(CONFIG_H8300_TPU)			+= h8300_tpu.o
 obj-$(CONFIG_CLKSRC_ST_LPC)		+= clksrc_st_lpc.o
-obj-$(CONFIG_X86_NUMACHIP)		+= numachip.o
+obj-$(CONFIG_X86_NUMACHIP)		+= numachip.o clocktree.o
diff --git a/drivers/clocksource/clocktree.c b/drivers/clocksource/clocktree.c
new file mode 100644
index 0000000..c9f315f
--- /dev/null
+++ b/drivers/clocksource/clocktree.c
@@ -0,0 +1,273 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/acpi.h>
+#include <linux/cpu.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/moduleparam.h>
+
+#include <asm/smp.h>
+#include <linux/atomic.h>
+#include <asm/numachip/numachip.h>
+#include <asm/numachip/numachip_csr.h>
+
+/* Refresh drift estimate after 50M core cycles (~20ms at 2.5GHz) */
+#define EXPIRY (2500000000ULL / 50)
+/* Fractional bits in the nb2core/core2nb fixed-point frequency ratios */
+#define MANTISSA 40
+#define PCI_MMIO_CONF(sci, bus, device, func, reg) \
+	(resource_size_t)(mcfg_base | ((u64)sci << 28) | ((bus) << 20) | ((device) << 15) | ((func) << 12) | (reg))
+
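+/*
+ * Per-node clock state, fitting one cacheline: reads return the local TSC
+ * corrected by this node's drift against the master northbridge timer
+ */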
+struct clocktree {
+	s64 drift;
+	u64 offset;
+	u64 last_update;
+	u64 nb2core;
+	u64 core2nb;
+	void __iomem *masterp;
+	u32 nb_tsc_last;
+	u8 updating;
+};
+
+struct numachip_info {
+	unsigned layout : 4;
+	unsigned size_x : 4;
+	unsigned size_y : 4;
+	unsigned size_z : 4;
+	unsigned northbridges : 3;
+	unsigned neigh_ht : 3;
+	unsigned neigh_link : 2;
+	unsigned symmetric : 1;
+	unsigned renumbering : 1;
+	unsigned remote_io : 1;
+	unsigned observer : 1;
+	unsigned cores : 8;
+	unsigned ht : 3;
+	u8 partition;
+	u16 fabric_nodes : 12;
+	u16 part_start : 12;
+	u16 part_nodes : 12;
+	unsigned pad : 7;
+	char firmware_ver[18];
+} __attribute__((packed));
+
+static struct numachip_info *numachip_info __read_mostly;
+static DEFINE_PER_CPU_READ_MOSTLY(struct clocktree *, clocktreep);
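+/* Per-node state, indexed by 12-bit SCI node id */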
+static struct clocktree *clocktrees[1 << 12] __read_mostly;
+static u64 nb2core __read_mostly;
+static u64 core2nb __read_mostly;
+static void __iomem *masterp __read_mostly;
+static u16 master_sci __read_mostly;
+static u64 __cacheline_aligned_in_smp global_last_wrap;
+static bool clocktree_enabled __read_mostly = true;
+static u64 offset;
+
+module_param_named(enabled, clocktree_enabled, bool, S_IWUSR | S_IRUGO);
+
+static cycles_t clocktree_read(struct clocksource *cs)
+{
+	struct clocktree *ct;
+	u64 core_tsc, final;
+	u32 wraps;
+
+	preempt_disable();
+	ct = __this_cpu_read(clocktreep);
+	core_tsc = rdtsc();
+
+	/* If stale, update */
+	if (unlikely(core_tsc > (ct->last_update + EXPIRY) && !cmpxchg(&ct->updating, 0, 1))) {
+		u64 nb_total;
+		const u32 nb_tsc = readl(ct->masterp);
+
+		/* Average TSC reads either side of the remote read to timestamp its midpoint */
+		core_tsc = ((__uint128_t)core_tsc + rdtsc_ordered()) / 2;
+		ct->last_update = core_tsc + EXPIRY;
+
+		/* Check if wrap observed */
+		if (unlikely(nb_tsc < ct->nb_tsc_last)) {
+			const u64 _final = (s64)core_tsc - ct->drift;
+
+			/* Update wrap counter, avoiding racing with others */
+			const u64 _global_last_wrap = ACCESS_ONCE(global_last_wrap);
+			const u32 global_wraps = ((__uint128_t)_global_last_wrap * ct->core2nb) >> (32 + MANTISSA);
+			const u32 new_wraps = ((__uint128_t)_final * ct->core2nb) >> (32 + MANTISSA);
+			if (new_wraps > global_wraps)
+				/* Expensive invalidation, so prevent racing */
+				cmpxchg(&global_last_wrap, _global_last_wrap, _final);
+		}
+		ct->nb_tsc_last = nb_tsc;
+
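+		/* Rebuild the full 64-bit northbridge time and rescale it to core cycles */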
+		wraps = ((__uint128_t)global_last_wrap * ct->core2nb) >> (32 + MANTISSA);
+		nb_total = ((u64)wraps << 32) + nb_tsc;
+		final = ((__uint128_t)nb_total * ct->nb2core) >> MANTISSA;
+
+		/* How far remote TSC is ahead of master TSC */
+		ct->drift = (s64)core_tsc - final;
+
+		/* Make the updated drift visible before releasing the update flag */
+		smp_wmb();
+		ct->updating = 0;
+		goto out;
+	}
+
+	final = (s64)core_tsc - ct->drift;
+out:
+	preempt_enable();
+	return final - ct->offset;
+}
+
+static struct clocksource clocksource_nb = {
+	.name                   = "clocktree6",
+	.rating                 = 290,
+	.read                   = clocktree_read,
+	.mask                   = CLOCKSOURCE_MASK(64),
+	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static void calibrate(void *param)
+{
+	u64 *val = (void *)param;
+	*val = rdtsc_ordered();
+}
+
+static void clocktree_online_action(void)
+{
+	struct clocktree *ctree;
+	u16 sci;
+
+	preempt_disable();
+	ctree = __this_cpu_read(clocktreep);
+	if (ctree)
+		goto out;
+
+	sci = read_lcsr(CSR_G0_NODE_IDS) >> 16;
+	if (!clocktrees[sci]) {
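+		/* Over-allocate so the state can be cacheline-aligned below */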
+		ctree = kzalloc_node(64 + sizeof(*ctree), GFP_ATOMIC, cpu_to_node(smp_processor_id()));
+		BUG_ON(!ctree);
+		ctree = (struct clocktree *)round_up((u64)ctree, 64);
+
+		ctree->nb2core = nb2core;
+		ctree->core2nb = core2nb;
+		ctree->masterp = masterp;
+		ctree->offset = offset;
+
+		{
+			u64 core0_tsc, nb_total, final, core_tsc = rdtsc_ordered();
+			u32 wraps;
+			const u32 nb_tsc = readl(masterp);
+
+			/* Average TSC reads either side of the remote read to timestamp its midpoint */
+			core_tsc = ((__uint128_t)core_tsc + rdtsc_ordered()) / 2;
+
+			if (sci == master_sci)
+				core0_tsc = rdtsc_ordered();
+			else
+				smp_call_function_single(0, calibrate, &core0_tsc, 1);
+			wraps = ((__uint128_t)core0_tsc * core2nb) >> (32 + MANTISSA);
+			nb_total = ((u64)wraps << 32) + nb_tsc;
+			final = ((__uint128_t)nb_total * nb2core) >> MANTISSA;
+
+			/* How far remote TSC is ahead of master TSC */
+			ctree->drift = (s64)core_tsc - final;
+		}
+
+		clocktrees[sci] = ctree;
+	}
+
+	__this_cpu_write(clocktreep, clocktrees[sci]);
+out:
+	preempt_enable();
+}
+
+static int parse_oemn(struct acpi_table_header *table)
+{
+	char verstr[sizeof(numachip_info->firmware_ver) + 1];
+	numachip_info = kzalloc(sizeof(*numachip_info), GFP_KERNEL);
+	BUG_ON(!numachip_info);
+	memcpy(numachip_info, (char *)table + sizeof(struct acpi_table_header), sizeof(*numachip_info));
+
+	/* NUL-terminate the firmware version string */
+	strncpy(verstr, numachip_info->firmware_ver, sizeof(numachip_info->firmware_ver));
+	verstr[sizeof(verstr) - 1] = '\0';
+	if (numachip_info->layout >= 5)
+		pr_info("NumaConnect firmware %s\n", verstr);
+
+	return 0;
+}
+
+static int __init clocktree_init(void)
+{
+	u64 val6, mcfg_base;
+	u32 core_mhz, nb_mhz, val, boosts = 0;
+	void __iomem *mapping;
+
+	BUILD_BUG_ON(sizeof(struct clocktree) > 64);
+
+	if (numachip_system != 1)
+		return 0;
+
+	if (!clocktree_enabled) {
+		pr_info("clocktree disabled\n");
+		return 0;
+	}
+
+	if (acpi_table_parse("OEMN", parse_oemn))
+		panic("NumaConnect: OEMN table parsing failed\n");
+
+	if (numachip_info->observer) {
+		pr_info("clocktree: observer mode\n");
+		return 0;
+	}
+
+	master_sci = read_lcsr(CSR_G0_NODE_IDS) >> 16;
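+	/* MMCONFIG base for config-space accesses; mask off enable and bus-range fields */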
+	rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, mcfg_base);
+	mcfg_base &= ~0x7f;
+
+	if (boot_cpu_data.x86 > 0x10) {
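+		/* Northbridge frequency: 200MHz * (NbFid + 4) / (1 + NbDid) */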
+		mapping = ioremap_nocache(PCI_MMIO_CONF(master_sci, 0, 0x18 + numachip_info->neigh_ht, 5, 0x160), 4);
+		val = readl(mapping);
+		iounmap(mapping);
+		nb_mhz = 200 * (((val >> 1) & 0x1f) + 4) / (1 + ((val >> 7) & 1));
+
+		mapping = ioremap_nocache(PCI_MMIO_CONF(master_sci, 0, 0x18 + numachip_info->neigh_ht, 4, 0x15c), 4);
+		boosts = (readl(mapping) >> 2) & 7;
+		iounmap(mapping);
+	} else {
+		mapping = ioremap_nocache(PCI_MMIO_CONF(master_sci, 0, 0x18 + numachip_info->neigh_ht, 3, 0xd4), 4);
+		val = readl(mapping);
+		iounmap(mapping);
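+		/* COFVID status MSR: bit 22 holds the current NbDid divisor */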
+		rdmsrl(0xc0010071, val6);
+		nb_mhz = 200 * ((val & 0x1f) + 4) / (1 + ((val6 >> 22) & 1));
+	}
+
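+	/* Core P0 frequency: 100MHz * (CpuFid + 0x10) >> CpuDid */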
+	rdmsrl(MSR_AMD_PSTATE_DEF_BASE + boosts, val6); /* P0 */
+	core_mhz = 100 * ((val6 & 0x3f) + 0x10) >> ((val6 >> 6) & 7);
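+	/* Fixed-point frequency ratios with MANTISSA fractional bits */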
+	nb2core = ((u64)core_mhz << MANTISSA) / nb_mhz;
+	core2nb = ((u64)nb_mhz << MANTISSA) / core_mhz;
+	masterp = ioremap_nocache(PCI_MMIO_CONF(master_sci, 0, 0x18 + numachip_info->neigh_ht, 2, 0xb0), 4);
+	pr_info("clocktree: core_mhz=%u nb_mhz=%u\n", core_mhz, nb_mhz);
+
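+	/* Subtracted on each read so the clock starts near zero */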
+	offset = rdtsc_ordered();
+	clocktree_online_action(); /* Setup core 0 */
+	x86_cpuinit.early_percpu_clock_init = clocktree_online_action;
+	clocksource_register_khz(&clocksource_nb, core_mhz * 1000);
+	global_last_wrap = rdtsc_ordered();
+
+	return 0;
+}
+
+early_initcall(clocktree_init);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 2e991f8..69419b2 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -432,7 +432,7 @@ struct rcu_data {
 					/*  delay between bouts of */
 					/*  quiescent-state forcing. */

-#define RCU_STALL_RAT_DELAY	2	/* Allow other CPUs time to take */
+#define RCU_STALL_RAT_DELAY	40	/* Allow other CPUs time to take */
 					/*  at least one scheduling clock */
 					/*  irq before ratting on them. */

