@@ -113,7 +113,16 @@ header, is usually reserved at an offset greater than boot memory
size (see Fig. 1). This area is *not* released: this region will
be kept permanently reserved, so that it can act as a receptacle
for a copy of the boot memory content in addition to CPU state
-and HPTE region, in the case a crash does occur.
+and HPTE region, in the case a crash does occur. Since this reserved
+memory area is used only after a system crash, there is no point in
+blocking this significant chunk of memory from the production kernel.
+Hence, the implementation marks the memory reserved for fadump as
+ZONE_MOVABLE. With ZONE_MOVABLE, this memory is available for
+applications to use, while the kernel is prevented from using it. With
+this, fadump is still able to capture all of the kernel memory and
+most of the user space memory, except the user pages that were present
+in the ZONE_MOVABLE region.
+
o Memory Reservation during first kernel
@@ -162,6 +171,9 @@ How to enable firmware-assisted dump (fadump):
1. Set config option CONFIG_FA_DUMP=y and build kernel.
2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
+ By default, the reserved memory will be marked as zone movable.
+ Alternatively, the user can boot the linux kernel with 'fadump=nonmovable'
+ to prevent fadump from marking the reserved memory as zone movable.
3. Optionally, user can also set 'crashkernel=' kernel cmdline
to specify size of the memory to reserve for boot memory dump
preservation.
@@ -172,6 +184,10 @@ NOTE: 1. 'fadump_reserve_mem=' parameter has been deprecated. Instead
2. If firmware-assisted dump fails to reserve memory then it
will fallback to existing kdump mechanism if 'crashkernel='
option is set at kernel cmdline.
+ 3. If the user wants to capture all of user space memory and is ok
+ with the reserved memory not being available to the production system,
+ then the 'fadump=nonmovable' kernel parameter can be used to fall back
+ to the old behaviour.
Sysfs/debugfs files:
------------
@@ -48,6 +48,10 @@
#define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt)
+/* Alignment per core mm requirement. */
+#define FADUMP_PAGEBLOCK_ALIGNMENT (PAGE_SIZE << \
+ max_t(unsigned long, MAX_ORDER - 1, pageblock_order))
+
/* Firmware provided dump sections */
#define FADUMP_CPU_STATE_DATA 0x0001
#define FADUMP_HPTE_REGION 0x0002
@@ -141,6 +145,7 @@ struct fw_dump {
unsigned long fadump_supported:1;
unsigned long dump_active:1;
unsigned long dump_registered:1;
+ unsigned long nonmovable:1; /* !ZONE_MOVABLE */
};
/*
@@ -34,6 +34,7 @@
#include <linux/crash_dump.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
+#include <linux/mmzone.h>
#include <asm/debugfs.h>
#include <asm/page.h>
@@ -375,8 +376,11 @@ int __init fadump_reserve_mem(void)
*/
if (fdm_active)
fw_dump.boot_memory_size = be64_to_cpu(fdm_active->rmr_region.source_len);
- else
+ else {
fw_dump.boot_memory_size = fadump_calculate_reserve_size();
+ fw_dump.boot_memory_size = ALIGN(fw_dump.boot_memory_size,
+ FADUMP_PAGEBLOCK_ALIGNMENT);
+ }
/*
* Calculate the memory boundary.
@@ -423,8 +427,7 @@ int __init fadump_reserve_mem(void)
fw_dump.fadumphdr_addr =
be64_to_cpu(fdm_active->rmr_region.destination_address) +
be64_to_cpu(fdm_active->rmr_region.source_len);
- pr_debug("fadumphdr_addr = %p\n",
- (void *) fw_dump.fadumphdr_addr);
+ pr_debug("fadumphdr_addr = %pa\n", &fw_dump.fadumphdr_addr);
} else {
size = get_fadump_area_size();
@@ -474,6 +477,10 @@ static int __init early_fadump_param(char *p)
fw_dump.fadump_enabled = 1;
else if (strncmp(p, "off", 3) == 0)
fw_dump.fadump_enabled = 0;
+ else if (strncmp(p, "nonmovable", 10) == 0) {
+ fw_dump.fadump_enabled = 1;
+ fw_dump.nonmovable = 1;
+ }
return 0;
}
@@ -1146,7 +1153,7 @@ static int fadump_unregister_dump(struct fadump_mem_struct *fdm)
return 0;
}
-static int fadump_invalidate_dump(struct fadump_mem_struct *fdm)
+static int fadump_invalidate_dump(const struct fadump_mem_struct *fdm)
{
int rc = 0;
unsigned int wait_time;
@@ -1177,9 +1184,8 @@ void fadump_cleanup(void)
{
/* Invalidate the registration only if dump is active. */
if (fw_dump.dump_active) {
- init_fadump_mem_struct(&fdm,
- be64_to_cpu(fdm_active->cpu_state_data.destination_address));
- fadump_invalidate_dump(&fdm);
+ /* pass the same memory dump structure provided by platform */
+ fadump_invalidate_dump(fdm_active);
} else if (fw_dump.dump_registered) {
/* Un-register Firmware-assisted dump if it was registered. */
fadump_unregister_dump(&fdm);
@@ -1525,3 +1531,63 @@ int __init setup_fadump(void)
return 1;
}
subsys_initcall(setup_fadump);
+
+/*
+ * Mark the fadump reserved area as ZONE_MOVABLE.
+ * The total size of fadump reserved memory covers boot memory size
+ * + cpu data size + hpte size and metadata. Initialize only the area
+ * equivalent to boot memory size as zone movable. The remaining portion
+ * of fadump reserved memory is not given to the movable zone and its
+ * pages stay reserved. Boot memory size is aligned per core mm
+ * requirement to satisfy the zone_movable_init_reserved_mem() call.
+ * Even if that call fails for some reason, we still have the memory
+ * reservation with us and can continue doing fadump.
+ */
+static int __init fadump_init_reserved_mem(void)
+{
+ unsigned long long base, size;
+ int rc;
+
+ if (!fw_dump.fadump_enabled)
+ return 0;
+
+ /* Ignore if booted with fadump=nonmovable */
+ if (fw_dump.nonmovable)
+ return 0;
+ /* Skip when a dump is already active. */
+ if (fw_dump.dump_active)
+ return 0;
+
+ /*
+ * Mark only the size equivalent to boot memory size as movable
+ * zone.
+ */
+ base = fw_dump.reserve_dump_area_start;
+ size = fw_dump.boot_memory_size;
+
+ if (!size)
+ return 0;
+
+ rc = zone_movable_init_reserved_mem(base, size);
+ if (rc) {
+ pr_err("Failed to init zone movable area for firmware-assisted dump,%d\n", rc);
+ /*
+ * Though the zone movable init has failed, we still have memory
+ * reservation with us. The reserved memory will be
+ * blocked from production system usage. Hence return 1,
+ * so that we can continue with fadump.
+ */
+ return 1;
+ }
+
+ /*
+ * So we now have successfully initialized reserved area as
+ * ZONE_MOVABLE for fadump. The base is reported in MB (base >> 20).
+ */
+ pr_info("Initialized 0x%llx bytes as zone movable area at %luMB from "
+ "0x%lx bytes of memory reserved for firmware-assisted dump\n",
+ size, (unsigned long)base >> 20,
+ fw_dump.reserve_dump_area_size);
+ return 1;
+}
+core_initcall(fadump_init_reserved_mem);