Index: sys/geom/vinum/geom_vinum.c =================================================================== RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum.c,v retrieving revision 1.21 diff -u -u -r1.21 geom_vinum.c --- sys/geom/vinum/geom_vinum.c 30 Mar 2006 14:01:25 -0000 1.21 +++ sys/geom/vinum/geom_vinum.c 3 Nov 2007 02:40:17 -0000 @@ -1,5 +1,6 @@ /*- - * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 2004, 2007 Lukas Ertl + * Copyright (c) 2007 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -40,7 +42,6 @@ #include #include #include -#include #if 0 SYSCTL_DECL(_kern_geom); @@ -49,14 +50,18 @@ "Debug level"); #endif -int gv_create(struct g_geom *, struct gctl_req *); +static int gv_create(struct g_geom *, struct gctl_req *); +static void gv_attach(struct gv_softc *, struct gctl_req *); +static void gv_detach(struct gv_softc *, struct gctl_req *); +static void gv_parityop(struct gv_softc *, struct gctl_req *); + static void gv_orphan(struct g_consumer *cp) { struct g_geom *gp; struct gv_softc *sc; - int error; + struct gv_drive *d; g_topology_assert(); @@ -64,59 +69,84 @@ gp = cp->geom; KASSERT(gp != NULL, ("gv_orphan: null gp")); sc = gp->softc; + KASSERT(sc != NULL, ("gv_orphan: null sc")); + d = cp->private; + KASSERT(d != NULL, ("gv_orphan: null d")); g_trace(G_T_TOPOLOGY, "gv_orphan(%s)", gp->name); - if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0) - g_access(cp, -cp->acr, -cp->acw, -cp->ace); - error = cp->provider->error; - if (error == 0) - error = ENXIO; - g_detach(cp); - g_destroy_consumer(cp); - if (!LIST_EMPTY(&gp->consumer)) - return; - g_free(sc); - g_wither_geom(gp, error); + gv_post_event(sc, GV_EVENT_DRIVE_LOST, d, NULL, 0, 0); } -static void +void gv_start(struct bio *bp) { - struct bio *bp2; struct g_geom *gp; + struct gv_softc *sc; gp = bp->bio_to->geom; - switch(bp->bio_cmd) { + sc = gp->softc; + + 
switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: - bp2 = g_clone_bio(bp); - if (bp2 == NULL) - g_io_deliver(bp, ENOMEM); - else { - bp2->bio_done = g_std_done; - g_io_request(bp2, LIST_FIRST(&gp->consumer)); - } - return; + break; + case BIO_GETATTR: default: g_io_deliver(bp, EOPNOTSUPP); return; } + + mtx_lock(&sc->queue_mtx); + bioq_disksort(sc->bqueue, bp); + wakeup(sc); + mtx_unlock(&sc->queue_mtx); } -static int +void +gv_done(struct bio *bp) +{ + struct g_geom *gp; + struct gv_softc *sc; + + gp = bp->bio_from->geom; + sc = gp->softc; + bp->bio_cflags |= GV_BIO_DONE; + + mtx_lock(&sc->queue_mtx); + bioq_disksort(sc->bqueue, bp); + wakeup(sc); + mtx_unlock(&sc->queue_mtx); +} + +int gv_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; - struct g_consumer *cp; + struct gv_softc *sc; + struct gv_drive *d, *d2; int error; - gp = pp->geom; error = ENXIO; - cp = LIST_FIRST(&gp->consumer); - error = g_access(cp, dr, dw, de); - return (error); + gp = pp->geom; + sc = gp->softc; + LIST_FOREACH(d, &sc->drives, drive) { + if (d->consumer == NULL) + continue; + error = g_access(d->consumer, dr, dw, de); + if (error) { + LIST_FOREACH(d2, &sc->drives, drive) { + if (d == d2) + break; + g_access(d2->consumer, -dr, -dw, -de); + } + printf("VINUM: g_access '%s' failed: %d\n", d->name, + error); + return (error); + } + } + return (0); } static void @@ -135,14 +165,136 @@ gp->softc = g_malloc(sizeof(struct gv_softc), M_WAITOK | M_ZERO); sc = gp->softc; sc->geom = gp; + sc->bqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); + bioq_init(sc->bqueue); LIST_INIT(&sc->drives); LIST_INIT(&sc->subdisks); LIST_INIT(&sc->plexes); LIST_INIT(&sc->volumes); + TAILQ_INIT(&sc->equeue); + mtx_init(&sc->config_mtx, "gv_config", NULL, MTX_DEF); + mtx_init(&sc->queue_mtx, "gv_queue", NULL, MTX_DEF); + kthread_create(gv_worker, sc, NULL, 0, 0, "gv_worker"); +} + +static int +gv_unload(struct gctl_req *req, struct g_class *mp, struct 
g_geom *gp) +{ + struct gv_softc *sc; + + g_trace(G_T_TOPOLOGY, "gv_unload(%p)", mp); + + g_topology_assert(); + sc = gp->softc; + + if (sc != NULL) { + gv_post_event(sc, GV_EVENT_THREAD_EXIT, NULL, NULL, 0, 0); + gp->softc = NULL; + g_wither_geom(gp, ENXIO); + return (EAGAIN); + } + + return (0); +} + +/* Handle userland request of attaching object. */ +static void +gv_attach(struct gv_softc *sc, struct gctl_req *req) +{ + struct gv_volume *v; + struct gv_plex *p; + struct gv_sd *s; + off_t *offset; + int *rename, type_child, type_parent; + char *child, *parent; + + child = gctl_get_param(req, "child", NULL); + if (child == NULL) { + gctl_error(req, "no child given"); + return; + } + parent = gctl_get_param(req, "parent", NULL); + if (parent == NULL) { + gctl_error(req, "no parent given"); + return; + } + offset = gctl_get_paraml(req, "offset", sizeof(*offset)); + if (offset == NULL) { + gctl_error(req, "no offset given"); + return; + } + rename = gctl_get_paraml(req, "rename", sizeof(*rename)); + if (rename == NULL) { + gctl_error(req, "no rename flag given"); + return; + } + + type_child = gv_object_type(sc, child); + type_parent = gv_object_type(sc, parent); + + switch (type_child) { + case GV_TYPE_PLEX: + if (type_parent != GV_TYPE_VOL) { + gctl_error(req, "no such volume to attach to"); + return; + } + v = gv_find_vol(sc, parent); + p = gv_find_plex(sc, child); + gv_post_event(sc, GV_EVENT_ATTACH_PLEX, p, v, *offset, *rename); + break; + case GV_TYPE_SD: + if (type_parent != GV_TYPE_PLEX) { + gctl_error(req, "no such plex to attach to"); + return; + } + p = gv_find_plex(sc, parent); +/* if (p->org == GV_PLEX_CONCAT) { + gctl_error(req, "attach on concat plex not allowed"); + break; + }*/ + s = gv_find_sd(sc, child); + gv_post_event(sc, GV_EVENT_ATTACH_SD, s, p, *offset, *rename); + break; + default: + gctl_error(req, "invalid child type"); + break; + } +} + +/* Handle userland request of detaching object. 
*/ +static void +gv_detach(struct gv_softc *sc, struct gctl_req *req) +{ + struct gv_plex *p; + struct gv_sd *s; + int *flags, type; + char *object; + + object = gctl_get_param(req, "object", NULL); + if (object == NULL) { + gctl_error(req, "no argument given"); + return; + } + + flags = gctl_get_paraml(req, "flags", sizeof(*flags)); + type = gv_object_type(sc, object); + switch (type) { + case GV_TYPE_PLEX: + p = gv_find_plex(sc, object); + gv_post_event(sc, GV_EVENT_DETACH_PLEX, p, NULL, *flags, 0); + break; + case GV_TYPE_SD: + s = gv_find_sd(sc, object); + gv_post_event(sc, GV_EVENT_DETACH_SD, s, NULL, *flags, 0); + break; + default: + gctl_error(req, "invalid object type"); + break; + } } /* Handle userland requests for creating new objects. */ -int +static int gv_create(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; @@ -150,10 +302,9 @@ struct gv_plex *p, *p2; struct gv_sd *s, *s2; struct gv_volume *v, *v2; - struct g_consumer *cp; struct g_provider *pp; - int error, i, *drives, *plexes, *subdisks, *volumes; - char buf[20], errstr[ERRBUFSIZ]; + int error, i, *drives, *flags, *plexes, *subdisks, *volumes; + char buf[20]; g_topology_assert(); @@ -164,39 +315,40 @@ plexes = gctl_get_paraml(req, "plexes", sizeof(*plexes)); subdisks = gctl_get_paraml(req, "subdisks", sizeof(*subdisks)); drives = gctl_get_paraml(req, "drives", sizeof(*drives)); + flags = gctl_get_paraml(req, "flags", sizeof(*flags)); /* First, handle drive definitions ... */ for (i = 0; i < *drives; i++) { snprintf(buf, sizeof(buf), "drive%d", i); d2 = gctl_get_paraml(req, buf, sizeof(*d2)); - d = gv_find_drive(sc, d2->name); - if (d != NULL) { - gctl_error(req, "drive '%s' is already known", - d->name); - continue; - } - - d = g_malloc(sizeof(*d), M_WAITOK | M_ZERO); - bcopy(d2, d, sizeof(*d)); - /* - * Make sure that the provider specified in the drive - * specification is an active GEOM provider. 
+ * Make sure that the device specified in the drive config is + * an active GEOM provider. */ - pp = g_provider_by_name(d->device); + pp = g_provider_by_name(d2->device); if (pp == NULL) { - gctl_error(req, "%s: drive not found", d->device); - g_free(d); - return (-1); + gctl_error(req, "%s: device not found", d2->device); + goto error; + } + if (gv_find_drive(sc, d2->name) != NULL) { + /* Ignore error. */ + if (*flags & GV_FLAG_F) + continue; + gctl_error(req, "drive '%s' already exists", d2->name); + goto error; } - d->size = pp->mediasize - GV_DATA_START; - d->avail = d->size; + if (gv_find_drive_device(sc, d2->device) != NULL) { + gctl_error(req, "device '%s' already configured in " + "gvinum", d2->device); + goto error; + } + - gv_config_new_drive(d); + d = g_malloc(sizeof(*d), M_WAITOK | M_ZERO); + bcopy(d2, d, sizeof(*d)); - d->flags |= GV_DRIVE_NEWBORN; - LIST_INSERT_HEAD(&sc->drives, d, drive); + gv_post_event(sc, GV_EVENT_CREATE_DRIVE, d, NULL, 0, 0); } /* ... then volume definitions ... */ @@ -205,19 +357,18 @@ snprintf(buf, sizeof(buf), "volume%d", i); v2 = gctl_get_paraml(req, buf, sizeof(*v2)); - v = gv_find_vol(sc, v2->name); - if (v != NULL) { - gctl_error(req, "volume '%s' is already known", - v->name); - return (-1); + if (gv_find_vol(sc, v2->name) != NULL) { + /* Ignore error. */ + if (*flags & GV_FLAG_F) + continue; + gctl_error(req, "volume '%s' already exists", v2->name); + goto error; } v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); bcopy(v2, v, sizeof(*v)); - v->vinumconf = sc; - LIST_INIT(&v->plexes); - LIST_INSERT_HEAD(&sc->volumes, v, volume); + gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0); } /* ... then plex definitions ... */ @@ -226,156 +377,43 @@ snprintf(buf, sizeof(buf), "plex%d", i); p2 = gctl_get_paraml(req, buf, sizeof(*p2)); - p = gv_find_plex(sc, p2->name); - if (p != NULL) { - gctl_error(req, "plex '%s' is already known", p->name); - return (-1); + if (gv_find_plex(sc, p2->name) != NULL) { + /* Ignore error. 
*/ + if (*flags & GV_FLAG_F) + continue; + gctl_error(req, "plex '%s' already exists", p2->name); + goto error; } p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); bcopy(p2, p, sizeof(*p)); - /* Find the volume this plex should be attached to. */ - v = gv_find_vol(sc, p->volume); - if (v == NULL) { - gctl_error(req, "volume '%s' not found", p->volume); - g_free(p); - continue; - } - if (v->plexcount) - p->flags |= GV_PLEX_ADDED; - p->vol_sc = v; - v->plexcount++; - LIST_INSERT_HEAD(&v->plexes, p, in_volume); - - p->vinumconf = sc; - p->flags |= GV_PLEX_NEWBORN; - LIST_INIT(&p->subdisks); - LIST_INSERT_HEAD(&sc->plexes, p, plex); + gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0); } - /* ... and finally, subdisk definitions. */ + /* ... and, finally, subdisk definitions. */ for (i = 0; i < *subdisks; i++) { error = 0; snprintf(buf, sizeof(buf), "sd%d", i); s2 = gctl_get_paraml(req, buf, sizeof(*s2)); - s = gv_find_sd(sc, s2->name); - if (s != NULL) { - gctl_error(req, "subdisk '%s' is already known", - s->name); - return (-1); + if (gv_find_sd(sc, s2->name) != NULL) { + /* Ignore error. */ + if (*flags & GV_FLAG_F) + continue; + gctl_error(req, "sd '%s' already exists", s2->name); + goto error; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); bcopy(s2, s, sizeof(*s)); - /* Find the drive where this subdisk should be put on. */ - d = gv_find_drive(sc, s->drive); - - /* drive not found - XXX */ - if (d == NULL) { - gctl_error(req, "drive '%s' not found", s->drive); - g_free(s); - continue; - } - - /* Find the plex where this subdisk belongs to. */ - p = gv_find_plex(sc, s->plex); - - /* plex not found - XXX */ - if (p == NULL) { - gctl_error(req, "plex '%s' not found\n", s->plex); - g_free(s); - continue; - } - - /* - * First we give the subdisk to the drive, to handle autosized - * values ... 
- */ - error = gv_sd_to_drive(sc, d, s, errstr, sizeof(errstr)); - if (error) { - gctl_error(req, errstr); - g_free(s); - continue; - } - - /* - * Then, we give the subdisk to the plex; we check if the - * given values are correct and maybe adjust them. - */ - error = gv_sd_to_plex(p, s, 1); - if (error) { - gctl_error(req, "GEOM_VINUM: couldn't give sd '%s' " - "to plex '%s'\n", s->name, p->name); - if (s->drive_sc) - LIST_REMOVE(s, from_drive); - gv_free_sd(s); - g_free(s); - /* - * If this subdisk can't be created, we won't create - * the attached plex either, if it is also a new one. - */ - if (!(p->flags & GV_PLEX_NEWBORN)) - continue; - LIST_FOREACH_SAFE(s, &p->subdisks, in_plex, s2) { - if (s->drive_sc) - LIST_REMOVE(s, from_drive); - p->sdcount--; - LIST_REMOVE(s, in_plex); - LIST_REMOVE(s, sd); - gv_free_sd(s); - g_free(s); - } - if (p->vol_sc != NULL) { - LIST_REMOVE(p, in_volume); - p->vol_sc->plexcount--; - } - LIST_REMOVE(p, plex); - g_free(p); - continue; - } - s->flags |= GV_SD_NEWBORN; - - s->vinumconf = sc; - LIST_INSERT_HEAD(&sc->subdisks, s, sd); + gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); } - LIST_FOREACH(s, &sc->subdisks, sd) - gv_update_sd_state(s); - LIST_FOREACH(p, &sc->plexes, plex) - gv_update_plex_config(p); - LIST_FOREACH(v, &sc->volumes, volume) - gv_update_vol_state(v); - - /* - * Write out the configuration to each drive. If the drive doesn't - * have a valid geom_slice geom yet, attach it temporarily to our VINUM - * geom. - */ - LIST_FOREACH(d, &sc->drives, drive) { - if (d->geom == NULL) { - /* - * XXX if the provider disapears before we get a chance - * to write the config out to the drive, should this - * be handled any differently? 
- */ - pp = g_provider_by_name(d->device); - if (pp == NULL) { - printf("geom_vinum: %s: drive disapeared?\n", - d->device); - continue; - } - cp = g_new_consumer(gp); - g_attach(cp, pp); - gv_save_config(cp, d, sc); - g_detach(cp); - g_destroy_consumer(cp); - } else - gv_save_config(NULL, d, sc); - d->flags &= ~GV_DRIVE_NEWBORN; - } +error: + gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); + gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); return (0); } @@ -393,13 +431,21 @@ gp = LIST_FIRST(&mp->geom); sc = gp->softc; - if (!strcmp(verb, "list")) { + if (!strcmp(verb, "attach")) { + gv_attach(sc, req); + + } else if (!strcmp(verb, "concat")) { + gv_concat(gp, req); + + } else if (!strcmp(verb, "detach")) { + gv_detach(sc, req); + + } else if (!strcmp(verb, "list")) { gv_list(gp, req); /* Save our configuration back to disk. */ } else if (!strcmp(verb, "saveconfig")) { - - gv_save_config_all(sc); + gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); /* Return configuration in string form. 
*/ } else if (!strcmp(verb, "getconfig")) { @@ -414,11 +460,18 @@ } else if (!strcmp(verb, "create")) { gv_create(gp, req); + } else if (!strcmp(verb, "mirror")) { + gv_mirror(gp, req); + } else if (!strcmp(verb, "move")) { gv_move(gp, req); - } else if (!strcmp(verb, "parityop")) { - gv_parityop(gp, req); + } else if (!strcmp(verb, "raid5")) { + gv_raid5(gp, req); + + } else if (!strcmp(verb, "rebuildparity") || + !strcmp(verb, "checkparity")) { + gv_parityop(sc, req); } else if (!strcmp(verb, "remove")) { gv_remove(gp, req); @@ -427,100 +480,498 @@ gv_rename(gp, req); } else if (!strcmp(verb, "resetconfig")) { - gv_resetconfig(gp, req); + gv_post_event(sc, GV_EVENT_RESET_CONFIG, sc, NULL, 0, 0); } else if (!strcmp(verb, "start")) { gv_start_obj(gp, req); + } else if (!strcmp(verb, "stripe")) { + gv_stripe(gp, req); + } else if (!strcmp(verb, "setstate")) { gv_setstate(gp, req); - } else gctl_error(req, "Unknown verb parameter"); } -#if 0 -static int -gv_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) +static void +gv_parityop(struct gv_softc *sc, struct gctl_req *req) { - struct g_geom *gp2; + struct gv_plex *p; + int *flags, *rebuild, type; + char *plex; + + plex = gctl_get_param(req, "plex", NULL); + if (plex == NULL) { + gctl_error(req, "no plex given"); + return; + } + + flags = gctl_get_paraml(req, "flags", sizeof(*flags)); + if (flags == NULL) { + gctl_error(req, "no flags given"); + return; + } + + rebuild = gctl_get_paraml(req, "rebuild", sizeof(*rebuild)); + if (rebuild == NULL) { + gctl_error(req, "no operation given"); + return; + } + + type = gv_object_type(sc, plex); + if (type != GV_TYPE_PLEX) { + gctl_error(req, "'%s' is not a plex", plex); + return; + } + p = gv_find_plex(sc, plex); + + if (p->state != GV_PLEX_UP) { + gctl_error(req, "plex %s is not completely accessible", + p->name); + return; + } + + if (p->org != GV_PLEX_RAID5) { + gctl_error(req, "plex %s is not a RAID5 plex", p->name); + return; + } + + /* Put it in 
the event queue. */ + /* XXX: The state of the plex might have changed when this event is + * picked up ... We should perhaps check this afterwards. */ + if (*rebuild) + gv_post_event(sc, GV_EVENT_PARITY_REBUILD, p, NULL, 0, 0); + else + gv_post_event(sc, GV_EVENT_PARITY_CHECK, p, NULL, 0, 0); +} + + +static struct g_geom * +gv_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) +{ + struct g_geom *gp; struct g_consumer *cp; struct gv_softc *sc; - struct gv_drive *d, *d2; - struct gv_plex *p, *p2; - struct gv_sd *s, *s2; - struct gv_volume *v, *v2; - struct gv_freelist *fl, *fl2; + struct gv_hdr *vhdr; - g_trace(G_T_TOPOLOGY, "gv_destroy_geom: %s", gp->name); - g_topology_assert(); + vhdr = NULL; - KASSERT(gp != NULL, ("gv_destroy_geom: null gp")); - KASSERT(gp->softc != NULL, ("gv_destroy_geom: null sc")); + g_topology_assert(); + g_trace(G_T_TOPOLOGY, "gv_taste(%s, %s)", mp->name, pp->name); + gp = LIST_FIRST(&mp->geom); + if (gp == NULL) { + printf("VINUM error: tasting, but not initialized?\n"); + return (NULL); + } sc = gp->softc; - /* - * Check if any of our drives is still open; if so, refuse destruction. - */ - LIST_FOREACH(d, &sc->drives, drive) { - gp2 = d->geom; - cp = LIST_FIRST(&gp2->consumer); - if (cp != NULL) - g_access(cp, -1, -1, -1); - if (gv_is_open(gp2)) - return (EBUSY); - } - - /* Clean up and deallocate what we allocated. 
*/ - LIST_FOREACH_SAFE(d, &sc->drives, drive, d2) { - LIST_REMOVE(d, drive); - g_free(d->hdr); - d->hdr = NULL; - LIST_FOREACH_SAFE(fl, &d->freelist, freelist, fl2) { - d->freelist_entries--; - LIST_REMOVE(fl, freelist); - g_free(fl); - fl = NULL; - } - d->geom->softc = NULL; - g_free(d); - } - - LIST_FOREACH_SAFE(s, &sc->subdisks, sd, s2) { - LIST_REMOVE(s, sd); - s->drive_sc = NULL; - s->plex_sc = NULL; - s->provider = NULL; - s->consumer = NULL; - g_free(s); - } - - LIST_FOREACH_SAFE(p, &sc->plexes, plex, p2) { - LIST_REMOVE(p, plex); - gv_kill_thread(p); - p->vol_sc = NULL; - p->geom->softc = NULL; - p->provider = NULL; - p->consumer = NULL; - if (p->org == GV_PLEX_RAID5) { - mtx_destroy(&p->worklist_mtx); - } - g_free(p); - } - - LIST_FOREACH_SAFE(v, &sc->volumes, volume, v2) { - LIST_REMOVE(v, volume); - v->geom->softc = NULL; - g_free(v); - } - - gp->softc = NULL; - g_free(sc); - g_wither_geom(gp, ENXIO); - return (0); + cp = g_new_consumer(gp); + if (g_attach(cp, pp) != 0) { + g_destroy_consumer(cp); + return (NULL); + } + if (g_access(cp, 1, 0, 0) != 0) { + g_detach(cp); + g_destroy_consumer(cp); + return (NULL); + } + g_topology_unlock(); + + vhdr = g_read_data(cp, GV_HDR_OFFSET, pp->sectorsize, NULL); + + g_topology_lock(); + g_access(cp, -1, 0, 0); + g_detach(cp); + g_destroy_consumer(cp); + + /* Check if what we've been given is a valid vinum drive. */ + if (vhdr != NULL) { + if (vhdr->magic == GV_MAGIC) + gv_post_event(sc, GV_EVENT_DRIVE_TASTED, pp, NULL, 0, + 0); + g_free(vhdr); + } + + return (NULL); +} + +void +gv_worker(void *arg) +{ + struct g_provider *pp; + struct gv_softc *sc; + struct gv_event *ev; + struct gv_volume *v; + struct gv_plex *p; + struct gv_sd *s; + struct gv_drive *d; + struct bio *bp; + int newstate, flags, err, rename; + char *newname; + off_t offset; + + sc = arg; + KASSERT(sc != NULL, ("NULL sc")); + mtx_lock(&sc->queue_mtx); + for (;;) { + /* Look at the events first... 
*/ + ev = TAILQ_FIRST(&sc->equeue); + if (ev != NULL) { + TAILQ_REMOVE(&sc->equeue, ev, events); + mtx_unlock(&sc->queue_mtx); + + switch (ev->type) { + case GV_EVENT_DRIVE_TASTED: + printf("VINUM: event 'drive tasted'\n"); + pp = ev->arg1; + gv_drive_tasted(sc, pp); + break; + + case GV_EVENT_DRIVE_LOST: + printf("VINUM: event 'drive lost'\n"); + d = ev->arg1; + gv_drive_lost(sc, d); + break; + + case GV_EVENT_CREATE_DRIVE: + printf("VINUM: event 'create drive'\n"); + d = ev->arg1; + gv_create_drive(sc, d); + break; + + case GV_EVENT_CREATE_VOLUME: + printf("VINUM: event 'create volume'\n"); + v = ev->arg1; + gv_create_volume(sc, v); + break; + + case GV_EVENT_CREATE_PLEX: + printf("VINUM: event 'create plex'\n"); + p = ev->arg1; + gv_create_plex(sc, p); + break; + + case GV_EVENT_CREATE_SD: + printf("VINUM: event 'create sd'\n"); + s = ev->arg1; + gv_create_sd(sc, s); + break; + + case GV_EVENT_RM_DRIVE: + printf("VINUM: event 'remove drive'\n"); + d = ev->arg1; + flags = ev->arg3; + gv_rm_drive(sc, d, flags); + /*gv_setup_objects(sc);*/ + break; + + case GV_EVENT_RM_VOLUME: + printf("VINUM: event 'remove volume'\n"); + v = ev->arg1; + gv_rm_vol(sc, v); + /*gv_setup_objects(sc);*/ + break; + + case GV_EVENT_RM_PLEX: + printf("VINUM: event 'remove plex'\n"); + p = ev->arg1; + gv_rm_plex(sc, p); + /*gv_setup_objects(sc);*/ + break; + + case GV_EVENT_RM_SD: + printf("VINUM: event 'remove sd'\n"); + s = ev->arg1; + gv_rm_sd(sc, s); + /*gv_setup_objects(sc);*/ + break; + + case GV_EVENT_SAVE_CONFIG: + printf("VINUM: event 'save config'\n"); + gv_save_config(sc); + break; + + case GV_EVENT_SET_SD_STATE: + printf("VINUM: event 'setstate sd'\n"); + s = ev->arg1; + newstate = ev->arg3; + flags = ev->arg4; + err = gv_set_sd_state(s, newstate, flags); + if (err) + printf("VINUM: error setting subdisk " + "state: error code %d\n", err); + break; + + case GV_EVENT_SET_DRIVE_STATE: + printf("VINUM: event 'setstate drive'\n"); + d = ev->arg1; + newstate = ev->arg3; + flags = 
ev->arg4; + err = gv_set_drive_state(d, newstate, flags); + if (err) + printf("VINUM: error setting drive " + "state: error code %d\n", err); + break; + + case GV_EVENT_SET_VOL_STATE: + printf("VINUM: event 'setstate volume'\n"); + v = ev->arg1; + newstate = ev->arg3; + flags = ev->arg4; + err = gv_set_vol_state(v, newstate, flags); + if (err) + printf("VINUM: error setting volume " + "state: error code %d\n", err); + break; + + case GV_EVENT_SET_PLEX_STATE: + printf("VINUM: event 'setstate plex'\n"); + p = ev->arg1; + newstate = ev->arg3; + flags = ev->arg4; + err = gv_set_plex_state(p, newstate, flags); + if (err) + printf("VINUM: error setting plex " + "state: error code %d\n", err); + break; + + case GV_EVENT_SETUP_OBJECTS: + printf("VINUM: event 'setup objects'\n"); + gv_setup_objects(sc); + break; + + case GV_EVENT_RESET_CONFIG: + printf("VINUM: event 'resetconfig'\n"); + err = gv_resetconfig(sc); + if (err) + printf("VINUM: error resetting config: " + "error code %d\n", err); + break; + + case GV_EVENT_PARITY_REBUILD: + /* + * Start the rebuild. The gv_plex_done will + * handle issuing of the remaining rebuild bio's + * until it's finished. + */ + printf("VINUM: event 'rebuild'\n"); + p = ev->arg1; + if (p->state != GV_PLEX_UP) { + printf("VINUM: plex %s is not " + "completely accessible", p->name); + break; + } + p->synced = 0; + g_topology_assert_not(); + g_topology_lock(); + err = gv_access(p->vol_sc->provider, 1, 1, 0); + if (err) { + printf("VINUM: unable to access " + "provider\n"); + break; + } + g_topology_unlock(); + gv_parity_request(p, GV_BIO_CHECK | + GV_BIO_PARITY, 0); + break; + + case GV_EVENT_PARITY_CHECK: + /* Start parity check. 
*/ + printf("VINUM: event 'check'\n"); + p = ev->arg1; + if (p->state != GV_PLEX_UP) { + printf("VINUM: plex %s is not " + "completely accessible", p->name); + break; + } + p->synced = 0; + g_topology_assert_not(); + g_topology_lock(); + err = gv_access(p->vol_sc->provider, 1, 1, 0); + if (err) { + printf("VINUM: unable to access " + "provider\n"); + break; + } + g_topology_unlock(); + gv_parity_request(p, GV_BIO_CHECK, 0); + break; + + case GV_EVENT_START_PLEX: + printf("VINUM: event 'start'\n"); + p = ev->arg1; + gv_start_plex(p); + break; + + case GV_EVENT_START_VOLUME: + printf("VINUM: event 'start'\n"); + v = ev->arg1; + gv_start_vol(v); + break; + + case GV_EVENT_ATTACH_PLEX: + printf("VINUM: event 'attach'\n"); + p = ev->arg1; + v = ev->arg2; + rename = ev->arg4; + err = gv_attach_plex(p, v, rename); + if (err) + printf("VINUM: error attaching %s to " + "%s: error code %d\n", p->name, + v->name, err); + break; + + case GV_EVENT_ATTACH_SD: + printf("VINUM: event 'attach'\n"); + s = ev->arg1; + p = ev->arg2; + offset = ev->arg3; + rename = ev->arg4; + err = gv_attach_sd(s, p, offset, rename); + if (err) + printf("VINUM: error attaching %s to " + "%s: error code %d\n", s->name, + p->name, err); + break; + + case GV_EVENT_DETACH_PLEX: + printf("VINUM: event 'detach'\n"); + p = ev->arg1; + flags = ev->arg3; + err = gv_detach_plex(p, flags); + if (err) + printf("VINUM: error detaching %s: " + "error code %d\n", p->name, err); + break; + + case GV_EVENT_DETACH_SD: + printf("VINUM: event 'detach'\n"); + s = ev->arg1; + flags = ev->arg3; + err = gv_detach_sd(s, flags); + if (err) + printf("VINUM: error detaching %s: " + "error code %d\n", s->name, err); + break; + + case GV_EVENT_RENAME_VOL: + printf("VINUM: event 'rename'\n"); + v = ev->arg1; + newname = ev->arg2; + flags = ev->arg3; + err = gv_rename_vol(sc, v, newname, flags); + if (err) + printf("VINUM: error renaming %s to %s:" + " error code %d\n", v->name, + newname, err); + g_free(newname); + break; + + case 
GV_EVENT_RENAME_PLEX: + printf("VINUM: event 'rename'\n"); + p = ev->arg1; + newname = ev->arg2; + flags = ev->arg3; + err = gv_rename_plex(sc, p, newname, flags); + if (err) + printf("VINUM: error renaming %s to %s:" + " error code %d\n", p->name, + newname, err); + g_free(newname); + break; + + case GV_EVENT_RENAME_SD: + printf("VINUM: event 'rename'\n"); + s = ev->arg1; + newname = ev->arg2; + flags = ev->arg3; + err = gv_rename_sd(sc, s, newname, flags); + if (err) + printf("VINUM: error renaming %s to %s:" + " error code %d\n", s->name, + newname, err); + g_free(newname); + break; + + case GV_EVENT_RENAME_DRIVE: + printf("VINUM: event 'rename'\n"); + d = ev->arg1; + newname = ev->arg2; + flags = ev->arg3; + err = gv_rename_drive(sc, d, newname, flags); + if (err) + printf("VINUM: error renaming %s to %s:" + " error code %d\n", d->name, + newname, err); + g_free(newname); + break; + + case GV_EVENT_MOVE_SD: + printf("VINUM: event 'move'\n"); + s = ev->arg1; + d = ev->arg2; + flags = ev->arg3; + err = gv_move_sd(sc, s, d, flags); + if (err) + printf("VINUM: error moving %s to %s: " + " error code %d\n", s->name, + d->name, err); + break; + + case GV_EVENT_THREAD_EXIT: + printf("VINUM: event 'thread exit'\n"); + g_free(ev); + mtx_lock(&sc->queue_mtx); + gv_cleanup(sc); + mtx_destroy(&sc->queue_mtx); + g_free(sc->bqueue); + g_free(sc); + kthread_exit(ENXIO); + break; /* not reached */ + + default: + printf("VINUM: unknown event %d\n", ev->type); + } + + g_free(ev); + + mtx_lock(&sc->queue_mtx); + continue; + } + + /* ... then do I/O processing. */ + bp = bioq_takefirst(sc->bqueue); + if (bp == NULL) { + msleep(sc, &sc->queue_mtx, PRIBIO, "-", hz/10); + continue; + } + mtx_unlock(&sc->queue_mtx); + + /* A bio that is coming up from an underlying device. */ + if (bp->bio_cflags & GV_BIO_DONE) { + gv_bio_done(sc, bp); + /* A bio that interfered with another bio. 
*/ + } else if (bp->bio_cflags & GV_BIO_ONHOLD) { + s = bp->bio_caller1; + p = s->plex_sc; + /* Is it still locked out? */ + if (gv_stripe_active(p, bp)) { + /* Park the bio on the waiting queue. */ + bioq_disksort(p->wqueue, bp); + } else { + bp->bio_cflags &= ~GV_BIO_ONHOLD; + g_io_request(bp, s->drive_sc->consumer); + } + /* A fresh bio, scheduled it down. */ + } else { + gv_volume_start(sc, bp); + } + + mtx_lock(&sc->queue_mtx); + } } -#endif #define VINUM_CLASS_NAME "VINUM" @@ -528,8 +979,9 @@ .name = VINUM_CLASS_NAME, .version = G_VERSION, .init = gv_init, - /*.destroy_geom = gv_destroy_geom,*/ + .taste = gv_taste, .ctlreq = gv_config, + .destroy_geom = gv_unload, }; DECLARE_GEOM_CLASS(g_vinum_class, g_vinum); Index: sys/geom/vinum/geom_vinum.h =================================================================== RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum.h,v retrieving revision 1.13 diff -u -u -r1.13 geom_vinum.h --- sys/geom/vinum/geom_vinum.h 12 Apr 2007 17:54:35 -0000 1.13 +++ sys/geom/vinum/geom_vinum.h 3 Nov 2007 02:40:17 -0000 @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -29,18 +29,19 @@ #ifndef _GEOM_VINUM_H_ #define _GEOM_VINUM_H_ -#define ERRBUFSIZ 1024 +/* geom_vinum_create.c */ +void gv_concat(struct g_geom *gp, struct gctl_req *); +void gv_mirror(struct g_geom *gp, struct gctl_req *); +void gv_stripe(struct g_geom *gp, struct gctl_req *); +void gv_raid5(struct g_geom *gp, struct gctl_req *); /* geom_vinum_drive.c */ -void gv_config_new_drive(struct gv_drive *); -void gv_drive_modify(struct gv_drive *); -void gv_save_config_all(struct gv_softc *); -void gv_save_config(struct g_consumer *, struct gv_drive *, - struct gv_softc *); +void gv_save_config(struct gv_softc *); /* geom_vinum_init.c */ -void gv_parityop(struct g_geom *, struct gctl_req *); void gv_start_obj(struct g_geom *, struct gctl_req *); +int gv_start_plex(struct gv_plex *); +int gv_start_vol(struct gv_volume *); /* geom_vinum_list.c */ void gv_ld(struct g_geom *, struct gctl_req *, struct sbuf *); @@ -51,21 +52,31 @@ /* geom_vinum_move.c */ void gv_move(struct g_geom *, struct gctl_req *); +int gv_move_sd(struct gv_softc *, struct gv_sd *, struct gv_drive *, int); /* geom_vinum_rename.c */ void gv_rename(struct g_geom *, struct gctl_req *); +int gv_rename_drive(struct gv_softc *, struct gv_drive *, char *, int); +int gv_rename_plex(struct gv_softc *, struct gv_plex *, char *, int); +int gv_rename_sd(struct gv_softc *, struct gv_sd *, char *, int); +int gv_rename_vol(struct gv_softc *, struct gv_volume *, char *, int); /* geom_vinum_rm.c */ void gv_remove(struct g_geom *, struct gctl_req *); -int gv_resetconfig(struct g_geom *, struct gctl_req *); -int gv_rm_sd(struct gv_softc *sc, struct gctl_req *req, - struct gv_sd *s, int flags); +int gv_resetconfig(struct gv_softc *); +void gv_rm_sd(struct gv_softc *sc, struct gv_sd *s); +void gv_rm_drive(struct gv_softc *, struct gv_drive *, int); +void gv_rm_plex(struct gv_softc *, struct gv_plex *); +void gv_rm_vol(struct gv_softc *, struct gv_volume 
*); + /* geom_vinum_state.c */ int gv_sdstatemap(struct gv_plex *); void gv_setstate(struct g_geom *, struct gctl_req *); int gv_set_drive_state(struct gv_drive *, int, int); int gv_set_sd_state(struct gv_sd *, int, int); +int gv_set_vol_state(struct gv_volume *, int, int); +int gv_set_plex_state(struct gv_plex *, int, int); void gv_update_sd_state(struct gv_sd *); void gv_update_plex_state(struct gv_plex *); void gv_update_vol_state(struct gv_volume *); @@ -73,25 +84,64 @@ /* geom_vinum_subr.c */ void gv_adjust_freespace(struct gv_sd *, off_t); void gv_free_sd(struct gv_sd *); -struct g_geom *find_vinum_geom(void); struct gv_drive *gv_find_drive(struct gv_softc *, char *); +struct gv_drive *gv_find_drive_device(struct gv_softc *, char *); struct gv_plex *gv_find_plex(struct gv_softc *, char *); struct gv_sd *gv_find_sd(struct gv_softc *, char *); struct gv_volume *gv_find_vol(struct gv_softc *, char *); void gv_format_config(struct gv_softc *, struct sbuf *, int, char *); int gv_is_striped(struct gv_plex *); -int gv_is_open(struct g_geom *); -void gv_kill_drive_thread(struct gv_drive *); -void gv_kill_plex_thread(struct gv_plex *); -void gv_kill_vol_thread(struct gv_volume *); +int gv_consumer_is_open(struct g_consumer *); +int gv_provider_is_open(struct g_provider *); int gv_object_type(struct gv_softc *, char *); -void gv_parse_config(struct gv_softc *, u_char *, int); -int gv_sd_to_drive(struct gv_softc *, struct gv_drive *, struct gv_sd *, - char *, int); -int gv_sd_to_plex(struct gv_plex *, struct gv_sd *, int); +void gv_parse_config(struct gv_softc *, char *, struct gv_drive *); +int gv_sd_to_drive(struct gv_sd *, struct gv_drive *); +int gv_sd_to_plex(struct gv_sd *, struct gv_plex *); +int gv_sdcount(struct gv_plex *, int); void gv_update_plex_config(struct gv_plex *); void gv_update_vol_size(struct gv_volume *, off_t); off_t gv_vol_size(struct gv_volume *); off_t gv_plex_size(struct gv_plex *); +int gv_plexdown(struct gv_volume *); +int 
gv_attach_plex(struct gv_plex *, struct gv_volume *, int); +int gv_attach_sd(struct gv_sd *, struct gv_plex *, off_t, int); +int gv_detach_plex(struct gv_plex *, int); +int gv_detach_sd(struct gv_sd *, int); + +void gv_worker(void *); +void gv_post_event(struct gv_softc *, int, void *, void *, intmax_t, + intmax_t); +void gv_drive_tasted(struct gv_softc *, struct g_provider *); +void gv_drive_lost(struct gv_softc *, struct gv_drive *); +void gv_setup_objects(struct gv_softc *); +void gv_start(struct bio *); +int gv_access(struct g_provider *, int, int, int); + +void gv_done(struct bio *); +void gv_volume_start(struct gv_softc *, struct bio *); + +void gv_bio_done(struct gv_softc *, struct bio *); +void gv_cleanup(struct gv_softc *); +int gv_create_drive(struct gv_softc *, struct gv_drive *); +int gv_create_volume(struct gv_softc *, struct gv_volume *); +int gv_create_plex(struct gv_softc *, struct gv_plex *); +int gv_create_sd(struct gv_softc *, struct gv_sd *); + +int gv_stripe_active(struct gv_plex *, struct bio *); + +/* geom_vinum_plex.c */ +void gv_plex_start(struct gv_plex *, struct bio *); +void gv_plex_raid5_done(struct gv_plex *, struct bio *); +void gv_plex_normal_done(struct gv_plex *, struct bio *); +int gv_grow_request(struct gv_plex *, off_t, off_t, int, caddr_t); +void gv_grow_complete(struct gv_plex *, struct bio *); +void gv_init_request(struct gv_sd *, off_t, caddr_t, off_t); +void gv_init_complete(struct gv_plex *, struct bio *); +void gv_parity_request(struct gv_plex *, int, off_t); +void gv_parity_complete(struct gv_plex *, struct bio *); +void gv_rebuild_complete(struct gv_plex *, struct bio *); +int gv_sync_request(struct gv_plex *, struct gv_plex *, off_t, off_t, int, + caddr_t); +int gv_sync_complete(struct gv_plex *, struct bio *); #endif /* !_GEOM_VINUM_H_ */ Index: sys/geom/vinum/geom_vinum_create.c =================================================================== RCS file: sys/geom/vinum/geom_vinum_create.c diff -N 
sys/geom/vinum/geom_vinum_create.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/geom/vinum/geom_vinum_create.c 3 Nov 2007 02:40:17 -0000 @@ -0,0 +1,611 @@ +/*- + * Copyright (c) 2007 Lukas Ertl + * Copyright (c) 2007 Ulf Lilleengen + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Create a new drive object, either by user request, during taste of the drive + * itself, or because it was referenced by a subdisk during taste. 
+ */
+/*
+ * Ownership: on every failure path 'd' is freed and GV_ERR_CREATE is
+ * returned, so the caller must not use 'd' afterwards.  Returns 0 on success.
+ */
+int
+gv_create_drive(struct gv_softc *sc, struct gv_drive *d)
+{
+	struct g_geom *gp;
+	struct g_provider *pp;
+	struct g_consumer *cp, *cp2;
+	struct gv_drive *d2;
+	struct gv_hdr *hdr;
+	struct gv_freelist *fl;
+
+	KASSERT(d != NULL, ("gv_create_drive: NULL d"));
+
+	gp = sc->geom;
+
+	pp = NULL;
+	cp = cp2 = NULL;
+
+	/* The drive already has a consumer if it was tasted before. */
+	if (d->consumer != NULL) {
+		cp = d->consumer;
+		cp->private = d;
+		pp = cp->provider;
+	} else if (!(d->flags & GV_DRIVE_REFERENCED)) {
+		if (gv_find_drive(sc, d->name) != NULL) {
+			printf("VINUM: drive '%s' already exists\n", d->name);
+			g_free(d);
+			return (GV_ERR_CREATE);
+		}
+
+		if (gv_find_drive_device(sc, d->device) != NULL) {
+			printf("VINUM: provider '%s' already in use by "
+			    "gvinum\n", d->device);
+			/* Free 'd' like every other error path does. */
+			g_free(d);
+			return (GV_ERR_CREATE);
+		}
+
+		pp = g_provider_by_name(d->device);
+		if (pp == NULL) {
+			printf("VINUM: create '%s': device '%s' disappeared?\n",
+			    d->name, d->device);
+			g_free(d);
+			return (GV_ERR_CREATE);
+		}
+
+		g_topology_lock();
+		cp = g_new_consumer(gp);
+		if (g_attach(cp, pp) != 0) {
+			g_destroy_consumer(cp);
+			g_topology_unlock();
+			printf("VINUM: create drive '%s': couldn't attach\n",
+			    d->name);
+			g_free(d);
+			return (GV_ERR_CREATE);
+		}
+		g_topology_unlock();
+
+		d->consumer = cp;
+		cp->private = d;
+	}
+
+	/*
+	 * If this was just a "referenced" drive, we're almost finished, but
+	 * insert this drive not on the head of the drives list, as
+	 * gv_drive_is_newer() expects a "real" drive from LIST_FIRST().
+	 */
+	if (d->flags & GV_DRIVE_REFERENCED) {
+		snprintf(d->device, GV_MAXDRIVENAME, "???");
+		d2 = LIST_FIRST(&sc->drives);
+		if (d2 == NULL)
+			LIST_INSERT_HEAD(&sc->drives, d, drive);
+		else
+			LIST_INSERT_AFTER(d2, d, drive);
+		return (0);
+	}
+
+	/*
+	 * Update access counts of the new drive to those of an already
+	 * existing drive.  Only the first other drive that has a consumer is
+	 * looked at.
+	 */
+	LIST_FOREACH(d2, &sc->drives, drive) {
+		if ((d == d2) || (d2->consumer == NULL))
+			continue;
+
+		cp2 = d2->consumer;
+		g_topology_lock();
+		if ((cp2->acr || cp2->acw || cp2->ace) &&
+		    (g_access(cp, cp2->acr, cp2->acw, cp2->ace) != 0)) {
+			g_detach(cp);
+			g_destroy_consumer(cp);
+			g_topology_unlock();
+			printf("VINUM: create drive '%s': couldn't update "
+			    "access counts\n", d->name);
+			if (d->hdr != NULL)
+				g_free(d->hdr);
+			g_free(d);
+			return (GV_ERR_CREATE);
+		}
+		g_topology_unlock();
+		break;
+	}
+
+	d->size = pp->mediasize - GV_DATA_START;
+	d->avail = d->size;
+	d->vinumconf = sc;
+	LIST_INIT(&d->subdisks);
+	LIST_INIT(&d->freelist);
+
+	/* The header might have been set during taste. */
+	if (d->hdr == NULL) {
+		hdr = g_malloc(sizeof(*hdr), M_WAITOK | M_ZERO);
+		hdr->magic = GV_MAGIC;
+		hdr->config_length = GV_CFG_LEN;
+		bcopy(hostname, hdr->label.sysname, GV_HOSTNAME_LEN);
+		/* strlcpy() always terminates, unlike strncpy(). */
+		strlcpy(hdr->label.name, d->name, GV_MAXDRIVENAME);
+		microtime(&hdr->label.date_of_birth);
+		d->hdr = hdr;
+	}
+
+	/* We also need a freelist entry. */
+	fl = g_malloc(sizeof(struct gv_freelist), M_WAITOK | M_ZERO);
+	fl->offset = GV_DATA_START;
+	fl->size = d->avail;
+	LIST_INSERT_HEAD(&d->freelist, fl, freelist);
+	d->freelist_entries = 1;
+
+	/* A drive that is already in the list is not inserted twice. */
+	if (gv_find_drive(sc, d->name) == NULL)
+		LIST_INSERT_HEAD(&sc->drives, d, drive);
+
+	gv_set_drive_state(d, GV_DRIVE_UP, 0);
+	return (0);
+}
+
+/*
+ * Link a new volume into the configuration: mark it newborn, initialise its
+ * plex list and allocate its write queue.  Always returns 0.
+ */
+int
+gv_create_volume(struct gv_softc *sc, struct gv_volume *v)
+{
+	KASSERT(v != NULL, ("gv_create_volume: NULL v"));
+
+	v->vinumconf = sc;
+	v->flags |= GV_VOL_NEWBORN;
+	LIST_INIT(&v->plexes);
+	LIST_INSERT_HEAD(&sc->volumes, v, volume);
+	v->wqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO);
+	bioq_init(v->wqueue);
+	return (0);
+}
+
+/*
+ * Link a new plex into the configuration and attach it to the volume named
+ * in p->volume.  If that volume does not exist, 'p' is freed and
+ * GV_ERR_CREATE is returned; otherwise returns 0.
+ */
+int
+gv_create_plex(struct gv_softc *sc, struct gv_plex *p)
+{
+	struct gv_volume *v;
+
+	KASSERT(p != NULL, ("gv_create_plex: NULL p"));
+
+	/* Find the volume this plex should be attached to. */
+	v = gv_find_vol(sc, p->volume);
+	if (v == NULL) {
+		printf("VINUM: create plex '%s': volume '%s' not found\n",
+		    p->name, p->volume);
+		g_free(p);
+		return (GV_ERR_CREATE);
+	}
+	/* A plex joining a volume that is not newborn is flagged as added. */
+	if (!(v->flags & GV_VOL_NEWBORN))
+		p->flags |= GV_PLEX_ADDED;
+	p->vol_sc = v;
+	v->plexcount++;
+	p->vinumconf = sc;
+	p->synced = 0;
+	p->flags |= GV_PLEX_NEWBORN;
+	LIST_INSERT_HEAD(&v->plexes, p, in_volume);
+	LIST_INIT(&p->subdisks);
+	TAILQ_INIT(&p->packets);
+	LIST_INSERT_HEAD(&sc->plexes, p, plex);
+	p->bqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO);
+	bioq_init(p->bqueue);
+	p->wqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO);
+	bioq_init(p->wqueue);
+	p->rqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO);
+	bioq_init(p->rqueue);
+	return (0);
+}
+
+/*
+ * Link a new subdisk into the configuration, giving it first to its drive
+ * and then to its plex.  On failure 's' is freed and GV_ERR_CREATE is
+ * returned; if the subdisk could not be given to a newborn plex, that plex
+ * is removed as well.  Returns 0 on success.
+ */
+int
+gv_create_sd(struct gv_softc *sc, struct gv_sd *s)
+{
+	struct gv_plex *p;
+	struct gv_drive *d;
+
+	KASSERT(s != NULL, ("gv_create_sd: NULL s"));
+
+	/* Find the drive where this subdisk should be put on. */
+	d = gv_find_drive(sc, s->drive);
+	if (d == NULL) {
+		/*
+		 * It's possible that the subdisk references a drive that
+		 * doesn't exist yet (during the taste process), so create a
+		 * practically empty "referenced" drive.
+		 */
+		if (s->flags & GV_SD_TASTED) {
+			d = g_malloc(sizeof(struct gv_drive),
+			    M_WAITOK | M_ZERO);
+			d->flags |= GV_DRIVE_REFERENCED;
+			/* strlcpy() always terminates, unlike strncpy(). */
+			strlcpy(d->name, s->drive, GV_MAXDRIVENAME);
+			gv_create_drive(sc, d);
+		} else {
+			printf("VINUM: create sd '%s': drive '%s' not found\n",
+			    s->name, s->drive);
+			g_free(s);
+			return (GV_ERR_CREATE);
+		}
+	}
+
+	/* Find the plex where this subdisk belongs to. */
+	p = gv_find_plex(sc, s->plex);
+	if (p == NULL) {
+		printf("VINUM: create sd '%s': plex '%s' not found\n",
+		    s->name, s->plex);
+		g_free(s);
+		return (GV_ERR_CREATE);
+	}
+
+/*	if (p->org == GV_PLEX_RAID5 && p->state == GV_PLEX_DEGRADED) {
+		printf("VINUM: can't add subdisk to %s, rebuild plex before "
+		    " adding subdisks\n", p->name);
+		g_free(s);
+		return (0);
+	}*/
+
+	/*
+	 * First we give the subdisk to the drive, to handle autosized
+	 * values ...
+	 */
+	if (gv_sd_to_drive(s, d) != 0) {
+		g_free(s);
+		return (GV_ERR_CREATE);
+	}
+
+	/*
+	 * Then, we give the subdisk to the plex; we check if the
+	 * given values are correct and maybe adjust them.
+	 */
+	if (gv_sd_to_plex(s, p) != 0) {
+		printf("VINUM: couldn't give sd '%s' to plex '%s'\n",
+		    s->name, p->name);
+		if (s->drive_sc && !(s->drive_sc->flags & GV_DRIVE_REFERENCED))
+			LIST_REMOVE(s, from_drive);
+		gv_free_sd(s);
+		g_free(s);
+		/*
+		 * If this subdisk can't be created, we won't create
+		 * the attached plex either, if it is also a new one.
+		 */
+		if (!(p->flags & GV_PLEX_NEWBORN))
+			return (GV_ERR_CREATE);
+		gv_rm_plex(sc, p);
+		return (GV_ERR_CREATE);
+	}
+	s->flags |= GV_SD_NEWBORN;
+
+	s->vinumconf = sc;
+	LIST_INSERT_HEAD(&sc->subdisks, s, sd);
+
+	return (0);
+}
+
+/* Stripe size used for automatically created striped plexes. */
+#define	GV_AUTO_STRIPESIZE	262144	/* XXX: DFLT_STRIPESIZE? */
+
+/*
+ * Post a GV_EVENT_CREATE_SD event for a new subdisk of plex 'p' located on
+ * the drive named by request parameter "drive<dnum>".  The subdisk is named
+ * "<plex>.s<sdnum>"; its offsets and size are left at -1 (presumably
+ * "choose automatically" -- see the autosize handling in gv_create_sd()).
+ * If the parameter is missing or names an unknown drive, a request error is
+ * set and nothing is posted.
+ */
+static void
+gv_post_autosized_sd(struct gv_softc *sc, struct gctl_req *req,
+    struct gv_plex *p, int dnum, int sdnum)
+{
+	struct gv_sd *s;
+	char *drive, buf[30];
+
+	snprintf(buf, sizeof(buf), "drive%d", dnum);
+	drive = gctl_get_param(req, buf, NULL);
+	/* Don't hand a NULL name to gv_find_drive(). */
+	if (drive == NULL) {
+		gctl_error(req, "drive name '%s' not given", buf);
+		return;
+	}
+	if (gv_find_drive(sc, drive) == NULL) {
+		gctl_error(req, "No such drive '%s'", drive);
+		return;
+	}
+
+	s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO);
+	snprintf(s->name, sizeof(s->name), "%s.s%d", p->name, sdnum);
+	strlcpy(s->plex, p->name, GV_MAXPLEXNAME);
+	strlcpy(s->drive, drive, GV_MAXDRIVENAME);
+	s->plex_offset = -1;
+	s->drive_offset = -1;
+	s->size = -1;
+	gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0);
+}
+
+/*
+ * Create a concatenated volume from specified drives or drivegroups.
+ */
+void
+gv_concat(struct g_geom *gp, struct gctl_req *req)
+{
+	struct gv_volume *v;
+	struct gv_plex *p;
+	struct gv_softc *sc;
+	char *vol;
+	int *drives, *flags, dcount;
+
+	sc = gp->softc;
+	vol = gctl_get_param(req, "name", NULL);
+	if (vol == NULL) {
+		gctl_error(req, "volume's not given");
+		return;
+	}
+
+	/* 'flags' is fetched but currently not used here. */
+	flags = gctl_get_paraml(req, "flags", sizeof(*flags));
+	drives = gctl_get_paraml(req, "drives", sizeof(*drives));
+
+	if (drives == NULL) {
+		gctl_error(req, "drives not given");
+		return;
+	}
+
+	/* First we create the volume. */
+	v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO);
+	strlcpy(v->name, vol, GV_MAXVOLNAME);
+	v->state = GV_VOL_UP;
+	gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0);
+
+	/* Then we create the plex. */
+	p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO);
+	snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, v->plexcount);
+	strlcpy(p->volume, v->name, GV_MAXVOLNAME);
+	p->org = GV_PLEX_CONCAT;
+	p->stripesize = 0;
+	gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0);
+
+	/* Drives are first (right now) priority */
+	for (dcount = 0; dcount < *drives; dcount++)
+		gv_post_autosized_sd(sc, req, p, dcount, dcount);
+
+	gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0);
+	gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
+}
+
+/*
+ * Create a mirrored volume from specified drives or drivegroups.
+ */
+void
+gv_mirror(struct g_geom *gp, struct gctl_req *req)
+{
+	struct gv_volume *v;
+	struct gv_plex *p;
+	struct gv_softc *sc;
+	char *vol;
+	int *drives, *flags, dcount, pcount, scount;
+
+	sc = gp->softc;
+	vol = gctl_get_param(req, "name", NULL);
+	if (vol == NULL) {
+		gctl_error(req, "volume's not given");
+		return;
+	}
+
+	flags = gctl_get_paraml(req, "flags", sizeof(*flags));
+	drives = gctl_get_paraml(req, "drives", sizeof(*drives));
+
+	if (drives == NULL) {
+		gctl_error(req, "drives not given");
+		return;
+	}
+
+	/* 'flags' is dereferenced below, so it must be present as well. */
+	if (flags == NULL) {
+		gctl_error(req, "flags not given");
+		return;
+	}
+
+	/* We must have an even number of drives. */
+	if (*drives % 2 != 0) {
+		gctl_error(req, "must have an even number of drives");
+		return;
+	}
+	if (*flags & GV_FLAG_S && *drives < 4) {
+		gctl_error(req, "must have at least 4 drives for striped plex");
+		return;
+	}
+
+	/* First we create the volume. */
+	v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO);
+	strlcpy(v->name, vol, GV_MAXVOLNAME);
+	v->state = GV_VOL_UP;
+	gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0);
+
+	/* Then we create the plexes. */
+	for (pcount = 0; pcount < 2; pcount++) {
+		p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO);
+		snprintf(p->name, sizeof(p->name), "%s.p%d", v->name,
+		    pcount);
+		strlcpy(p->volume, v->name, GV_MAXVOLNAME);
+		if (*flags & GV_FLAG_S) {
+			p->org = GV_PLEX_STRIPED;
+			p->stripesize = GV_AUTO_STRIPESIZE;
+		} else {
+			p->org = GV_PLEX_CONCAT;
+			p->stripesize = -1;
+		}
+		gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0);
+
+		/*
+		 * Give each even drive to plex one, and each odd drive to
+		 * plex two.  The subdisk number advances even when a drive
+		 * is skipped.
+		 */
+		scount = 0;
+		for (dcount = pcount; dcount < *drives; dcount += 2) {
+			/* XXX: Should we fail instead of skipping? */
+			gv_post_autosized_sd(sc, req, p, dcount, scount);
+			scount++;
+		}
+	}
+	gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0);
+	gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
+}
+
+/*
+ * Create a RAID5 volume from specified drives or drivegroups.
+ */
+void
+gv_raid5(struct g_geom *gp, struct gctl_req *req)
+{
+	struct gv_softc *sc;
+	struct gv_volume *v;
+	struct gv_plex *p;
+	int *drives, *flags, dcount;
+	char *vol;
+	off_t *stripesize;
+
+	sc = gp->softc;
+
+	vol = gctl_get_param(req, "name", NULL);
+	if (vol == NULL) {
+		gctl_error(req, "volume's not given");
+		return;
+	}
+	/* 'flags' is fetched but currently not used here. */
+	flags = gctl_get_paraml(req, "flags", sizeof(*flags));
+	drives = gctl_get_paraml(req, "drives", sizeof(*drives));
+	stripesize = gctl_get_paraml(req, "stripesize", sizeof(*stripesize));
+
+	if (stripesize == NULL) {
+		gctl_error(req, "no stripesize given");
+		return;
+	}
+
+	if (drives == NULL) {
+		gctl_error(req, "drives not given");
+		return;
+	}
+
+	/* We must have at least three drives. */
+	if (*drives < 3) {
+		gctl_error(req, "must have at least three drives for this "
+		    "plex organisation");
+		return;
+	}
+	/* First we create the volume. */
+	v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO);
+	strlcpy(v->name, vol, GV_MAXVOLNAME);
+	v->state = GV_VOL_UP;
+	gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0);
+
+	/* Then we create the plex. */
+	p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO);
+	snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, v->plexcount);
+	strlcpy(p->volume, v->name, GV_MAXVOLNAME);
+	p->org = GV_PLEX_RAID5;
+	p->stripesize = *stripesize;
+	gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0);
+
+	/* Create subdisks on drives. */
+	for (dcount = 0; dcount < *drives; dcount++)
+		gv_post_autosized_sd(sc, req, p, dcount, dcount);
+
+	gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0);
+	gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
+}
+
+/*
+ * Create a striped volume from specified drives or drivegroups.
+ */
+void
+gv_stripe(struct g_geom *gp, struct gctl_req *req)
+{
+	struct gv_volume *v;
+	struct gv_plex *p;
+	struct gv_softc *sc;
+	char *vol;
+	int *drives, *flags, dcount;
+
+	sc = gp->softc;
+	vol = gctl_get_param(req, "name", NULL);
+	if (vol == NULL) {
+		gctl_error(req, "volume's not given");
+		return;
+	}
+	/* 'flags' is fetched but currently not used here. */
+	flags = gctl_get_paraml(req, "flags", sizeof(*flags));
+	drives = gctl_get_paraml(req, "drives", sizeof(*drives));
+
+	if (drives == NULL) {
+		gctl_error(req, "drives not given");
+		return;
+	}
+
+	/* We must have at least two drives. */
+	if (*drives < 2) {
+		gctl_error(req, "must have at least 2 drives");
+		return;
+	}
+
+	/* First we create the volume. */
+	v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO);
+	strlcpy(v->name, vol, GV_MAXVOLNAME);
+	v->state = GV_VOL_UP;
+	gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0);
+
+	/* Then we create the plex. */
+	p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO);
+	snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, v->plexcount);
+	strlcpy(p->volume, v->name, GV_MAXVOLNAME);
+	p->org = GV_PLEX_STRIPED;
+	p->stripesize = GV_AUTO_STRIPESIZE;
+	gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0);
+
+	/* Create subdisks on drives. */
+	for (dcount = 0; dcount < *drives; dcount++)
+		gv_post_autosized_sd(sc, req, p, dcount, dcount);
+
+	gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0);
+	gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
+}
Index: sys/geom/vinum/geom_vinum_drive.c
===================================================================
RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum_drive.c,v
retrieving revision 1.25
diff -u -u -r1.25 geom_vinum_drive.c
--- sys/geom/vinum/geom_vinum_drive.c 6 Jan 2006 18:03:17 -0000 1.25
+++ sys/geom/vinum/geom_vinum_drive.c 3 Nov 2007 02:40:17 -0000
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2004, 2005 Lukas Ertl
+ * Copyright (c) 2004, 2005, 2007 Lukas Ertl
  * All rights reserved.
* * Redistribution and use in source and binary forms, with or without @@ -28,639 +28,103 @@ __FBSDID("$FreeBSD: src/sys/geom/vinum/geom_vinum_drive.c,v 1.25 2006/01/06 18:03:17 le Exp $"); #include -#include -#include -#include -#include -#include -#include #include #include -#include -#include -#include #include -#include #include #include #include -#include - -static void gv_drive_dead(void *, int); -static void gv_drive_worker(void *); - -void -gv_config_new_drive(struct gv_drive *d) -{ - struct gv_hdr *vhdr; - struct gv_freelist *fl; - - KASSERT(d != NULL, ("config_new_drive: NULL d")); - - vhdr = g_malloc(sizeof(*vhdr), M_WAITOK | M_ZERO); - vhdr->magic = GV_MAGIC; - vhdr->config_length = GV_CFG_LEN; - - bcopy(hostname, vhdr->label.sysname, GV_HOSTNAME_LEN); - strncpy(vhdr->label.name, d->name, GV_MAXDRIVENAME); - microtime(&vhdr->label.date_of_birth); - - d->hdr = vhdr; - - LIST_INIT(&d->subdisks); - LIST_INIT(&d->freelist); - - fl = g_malloc(sizeof(struct gv_freelist), M_WAITOK | M_ZERO); - fl->offset = GV_DATA_START; - fl->size = d->avail; - LIST_INSERT_HEAD(&d->freelist, fl, freelist); - d->freelist_entries = 1; - - d->bqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); - bioq_init(d->bqueue); - mtx_init(&d->bqueue_mtx, "gv_drive", NULL, MTX_DEF); - kthread_create(gv_drive_worker, d, NULL, 0, 0, "gv_d %s", d->name); - d->flags |= GV_DRIVE_THREAD_ACTIVE; -} +/* Save the vinum configuration back to each involved disk. */ void -gv_save_config_all(struct gv_softc *sc) +gv_save_config(struct gv_softc *sc) { + struct g_consumer *cp; struct gv_drive *d; - - g_topology_assert(); - - LIST_FOREACH(d, &sc->drives, drive) { - if (d->geom == NULL) - continue; - gv_save_config(NULL, d, sc); - } -} - -/* Save the vinum configuration back to disk. 
*/ -void -gv_save_config(struct g_consumer *cp, struct gv_drive *d, struct gv_softc *sc) -{ - struct g_geom *gp; - struct g_consumer *cp2; struct gv_hdr *vhdr, *hdr; struct sbuf *sb; + struct timeval last_update; int error; - g_topology_assert(); - - KASSERT(d != NULL, ("gv_save_config: null d")); KASSERT(sc != NULL, ("gv_save_config: null sc")); - /* - * We can't save the config on a drive that isn't up, but drives that - * were just created aren't officially up yet, so we check a special - * flag. - */ - if ((d->state != GV_DRIVE_UP) && !(d->flags && GV_DRIVE_NEWBORN)) - return; - - if (cp == NULL) { - gp = d->geom; - KASSERT(gp != NULL, ("gv_save_config: null gp")); - cp2 = LIST_FIRST(&gp->consumer); - KASSERT(cp2 != NULL, ("gv_save_config: null cp2")); - } else - cp2 = cp; - vhdr = g_malloc(GV_HDR_LEN, M_WAITOK | M_ZERO); vhdr->magic = GV_MAGIC; vhdr->config_length = GV_CFG_LEN; - - hdr = d->hdr; - if (hdr == NULL) { - printf("GEOM_VINUM: drive %s has NULL hdr\n", d->name); - g_free(vhdr); - return; - } - microtime(&hdr->label.last_update); - bcopy(&hdr->label, &vhdr->label, sizeof(struct gv_label)); + microtime(&last_update); sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN); gv_format_config(sc, sb, 1, NULL); sbuf_finish(sb); - error = g_access(cp2, 0, 1, 0); - if (error) { - printf("GEOM_VINUM: g_access failed on drive %s, errno %d\n", - d->name, error); - sbuf_delete(sb); - g_free(vhdr); - return; - } - g_topology_unlock(); - - do { - error = g_write_data(cp2, GV_HDR_OFFSET, vhdr, GV_HDR_LEN); - if (error) { - printf("GEOM_VINUM: writing vhdr failed on drive %s, " - "errno %d", d->name, error); - break; - } - - error = g_write_data(cp2, GV_CFG_OFFSET, sbuf_data(sb), - GV_CFG_LEN); - if (error) { - printf("GEOM_VINUM: writing first config copy failed " - "on drive %s, errno %d", d->name, error); - break; - } - - error = g_write_data(cp2, GV_CFG_OFFSET + GV_CFG_LEN, - sbuf_data(sb), GV_CFG_LEN); - if (error) - printf("GEOM_VINUM: writing second config copy 
failed " - "on drive %s, errno %d", d->name, error); - } while (0); - - g_topology_lock(); - g_access(cp2, 0, -1, 0); - sbuf_delete(sb); - g_free(vhdr); - - if (d->geom != NULL) - gv_drive_modify(d); -} - -/* This resembles g_slice_access(). */ -static int -gv_drive_access(struct g_provider *pp, int dr, int dw, int de) -{ - struct g_geom *gp; - struct g_consumer *cp; - struct g_provider *pp2; - struct gv_drive *d; - struct gv_sd *s, *s2; - int error; - - gp = pp->geom; - cp = LIST_FIRST(&gp->consumer); - if (cp == NULL) - return (0); - - d = gp->softc; - if (d == NULL) - return (0); - - s = pp->private; - KASSERT(s != NULL, ("gv_drive_access: NULL s")); - - LIST_FOREACH(s2, &d->subdisks, from_drive) { - if (s == s2) - continue; - if (s->drive_offset + s->size <= s2->drive_offset) - continue; - if (s2->drive_offset + s2->size <= s->drive_offset) + LIST_FOREACH(d, &sc->drives, drive) { + /* + * We can't save the config on a drive that isn't up, but + * drives that were just created aren't officially up yet, so + * we check a special flag. + */ + if (d->state != GV_DRIVE_UP) continue; - /* Overlap. */ - pp2 = s2->provider; - KASSERT(s2 != NULL, ("gv_drive_access: NULL s2")); - if ((pp->acw + dw) > 0 && pp2->ace > 0) - return (EPERM); - if ((pp->ace + de) > 0 && pp2->acw > 0) - return (EPERM); - } - - error = g_access(cp, dr, dw, de); - return (error); -} - -static void -gv_drive_done(struct bio *bp) -{ - struct gv_drive *d; - - /* Put the BIO on the worker queue again. 
*/ - d = bp->bio_from->geom->softc; - bp->bio_cflags |= GV_BIO_DONE; - mtx_lock(&d->bqueue_mtx); - bioq_insert_tail(d->bqueue, bp); - wakeup(d); - mtx_unlock(&d->bqueue_mtx); -} - - -static void -gv_drive_start(struct bio *bp) -{ - struct gv_drive *d; - struct gv_sd *s; - - switch (bp->bio_cmd) { - case BIO_READ: - case BIO_WRITE: - case BIO_DELETE: - break; - case BIO_GETATTR: - default: - g_io_deliver(bp, EOPNOTSUPP); - return; - } - - s = bp->bio_to->private; - if ((s->state == GV_SD_DOWN) || (s->state == GV_SD_STALE)) { - g_io_deliver(bp, ENXIO); - return; - } - - d = bp->bio_to->geom->softc; - - /* - * Put the BIO on the worker queue, where the worker thread will pick - * it up. - */ - mtx_lock(&d->bqueue_mtx); - bioq_disksort(d->bqueue, bp); - wakeup(d); - mtx_unlock(&d->bqueue_mtx); - -} - -static void -gv_drive_worker(void *arg) -{ - struct bio *bp, *cbp; - struct g_geom *gp; - struct g_provider *pp; - struct gv_drive *d; - struct gv_sd *s; - int error; - - d = arg; - - mtx_lock(&d->bqueue_mtx); - for (;;) { - /* We were signaled to exit. */ - if (d->flags & GV_DRIVE_THREAD_DIE) - break; - - /* Take the first BIO from out queue. */ - bp = bioq_takefirst(d->bqueue); - if (bp == NULL) { - msleep(d, &d->bqueue_mtx, PRIBIO, "-", hz/10); + cp = d->consumer; + if (cp == NULL) { + printf("VINUM: save config: drive '%s' has no " + "consumer!\n", d->name); continue; - } - mtx_unlock(&d->bqueue_mtx); - - pp = bp->bio_to; - gp = pp->geom; - - /* Completed request. */ - if (bp->bio_cflags & GV_BIO_DONE) { - error = bp->bio_error; - - /* Deliver the original request. */ - g_std_done(bp); - - /* The request had an error, we need to clean up. */ - if (error != 0) { - g_topology_lock(); - gv_set_drive_state(d, GV_DRIVE_DOWN, - GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); - g_topology_unlock(); - g_post_event(gv_drive_dead, d, M_WAITOK, d, - NULL); - } - - /* New request, needs to be sent downwards. 
*/ - } else { - s = pp->private; - - if ((s->state == GV_SD_DOWN) || - (s->state == GV_SD_STALE)) { - g_io_deliver(bp, ENXIO); - mtx_lock(&d->bqueue_mtx); - continue; - } - if (bp->bio_offset > s->size) { - g_io_deliver(bp, EINVAL); - mtx_lock(&d->bqueue_mtx); - continue; - } - - cbp = g_clone_bio(bp); - if (cbp == NULL) { - g_io_deliver(bp, ENOMEM); - mtx_lock(&d->bqueue_mtx); - continue; - } - if (cbp->bio_offset + cbp->bio_length > s->size) - cbp->bio_length = s->size - - cbp->bio_offset; - cbp->bio_done = gv_drive_done; - cbp->bio_offset += s->drive_offset; - g_io_request(cbp, LIST_FIRST(&gp->consumer)); } - mtx_lock(&d->bqueue_mtx); - } - - while ((bp = bioq_takefirst(d->bqueue)) != NULL) { - mtx_unlock(&d->bqueue_mtx); - if (bp->bio_cflags & GV_BIO_DONE) - g_std_done(bp); - else - g_io_deliver(bp, ENXIO); - mtx_lock(&d->bqueue_mtx); - } - mtx_unlock(&d->bqueue_mtx); - d->flags |= GV_DRIVE_THREAD_DEAD; - - kthread_exit(ENXIO); -} - - -static void -gv_drive_orphan(struct g_consumer *cp) -{ - struct g_geom *gp; - struct gv_drive *d; - - g_topology_assert(); - gp = cp->geom; - g_trace(G_T_TOPOLOGY, "gv_drive_orphan(%s)", gp->name); - d = gp->softc; - if (d != NULL) { - gv_set_drive_state(d, GV_DRIVE_DOWN, - GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); - g_post_event(gv_drive_dead, d, M_WAITOK, d, NULL); - } else - g_wither_geom(gp, ENXIO); -} - -static struct g_geom * -gv_drive_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) -{ - struct g_geom *gp, *gp2; - struct g_consumer *cp; - struct gv_drive *d; - struct gv_sd *s; - struct gv_softc *sc; - struct gv_freelist *fl; - struct gv_hdr *vhdr; - int error; - char *buf, errstr[ERRBUFSIZ]; - - vhdr = NULL; - d = NULL; - - g_trace(G_T_TOPOLOGY, "gv_drive_taste(%s, %s)", mp->name, pp->name); - g_topology_assert(); - - /* Find the VINUM class and its associated geom. 
*/ - gp2 = find_vinum_geom(); - if (gp2 == NULL) - return (NULL); - sc = gp2->softc; - - gp = g_new_geomf(mp, "%s.vinumdrive", pp->name); - gp->start = gv_drive_start; - gp->orphan = gv_drive_orphan; - gp->access = gv_drive_access; - gp->start = gv_drive_start; - - cp = g_new_consumer(gp); - g_attach(cp, pp); - error = g_access(cp, 1, 0, 0); - if (error) { - g_detach(cp); - g_destroy_consumer(cp); - g_destroy_geom(gp); - return (NULL); - } - - g_topology_unlock(); - - /* Now check if the provided slice is a valid vinum drive. */ - do { - vhdr = g_read_data(cp, GV_HDR_OFFSET, pp->sectorsize, NULL); - if (vhdr == NULL) - break; - if (vhdr->magic != GV_MAGIC) { + hdr = d->hdr; + if (hdr == NULL) { + printf("VINUM: drive '%s' has no header\n", d->name); g_free(vhdr); - break; - } - - /* A valid vinum drive, let's parse the on-disk information. */ - buf = g_read_data(cp, GV_CFG_OFFSET, GV_CFG_LEN, NULL); - if (buf == NULL) { - g_free(vhdr); - break; + continue; } + bcopy(&last_update, &hdr->label.last_update, + sizeof(struct timeval)); + bcopy(&hdr->label, &vhdr->label, sizeof(struct gv_label)); g_topology_lock(); - gv_parse_config(sc, buf, 1); - g_free(buf); - - /* - * Let's see if this drive is already known in the - * configuration. - */ - d = gv_find_drive(sc, vhdr->label.name); - - /* We already know about this drive. */ - if (d != NULL) { - /* Check if this drive already has a geom. */ - if (d->geom != NULL) { - g_topology_unlock(); - break; - } - bcopy(vhdr, d->hdr, sizeof(*vhdr)); - - /* This is a new drive. */ - } else { - d = g_malloc(sizeof(*d), M_WAITOK | M_ZERO); - - /* Initialize all needed variables. */ - d->size = pp->mediasize - GV_DATA_START; - d->avail = d->size; - d->hdr = vhdr; - strncpy(d->name, vhdr->label.name, GV_MAXDRIVENAME); - LIST_INIT(&d->subdisks); - LIST_INIT(&d->freelist); - - /* We also need a freelist entry. 
*/ - fl = g_malloc(sizeof(*fl), M_WAITOK | M_ZERO); - fl->offset = GV_DATA_START; - fl->size = d->avail; - LIST_INSERT_HEAD(&d->freelist, fl, freelist); - d->freelist_entries = 1; - - /* Save it into the main configuration. */ - LIST_INSERT_HEAD(&sc->drives, d, drive); - } - - /* - * Create bio queue, queue mutex and a worker thread, if - * necessary. - */ - if (d->bqueue == NULL) { - d->bqueue = g_malloc(sizeof(struct bio_queue_head), - M_WAITOK | M_ZERO); - bioq_init(d->bqueue); - } - if (mtx_initialized(&d->bqueue_mtx) == 0) - mtx_init(&d->bqueue_mtx, "gv_drive", NULL, MTX_DEF); - - if (!(d->flags & GV_DRIVE_THREAD_ACTIVE)) { - kthread_create(gv_drive_worker, d, NULL, 0, 0, - "gv_d %s", d->name); - d->flags |= GV_DRIVE_THREAD_ACTIVE; + error = g_access(cp, 0, 1, 0); + if (error) { + printf("VINUM: gv_save_config/g_access failed on " + "drive %s, errno %d\n", d->name, error); + g_topology_unlock(); + continue; } + g_topology_unlock(); - g_access(cp, -1, 0, 0); - - gp->softc = d; - d->geom = gp; - d->vinumconf = sc; - strncpy(d->device, pp->name, GV_MAXDRIVENAME); - - /* - * Find out which subdisks belong to this drive and crosslink - * them. - */ - LIST_FOREACH(s, &sc->subdisks, sd) { - if (!strncmp(s->drive, d->name, GV_MAXDRIVENAME)) - /* XXX: errors ignored */ - gv_sd_to_drive(sc, d, s, errstr, - sizeof(errstr)); + error = g_write_data(cp, GV_HDR_OFFSET, vhdr, GV_HDR_LEN); + if (error) { + printf("VINUM: writing vhdr failed on drive %s, " + "errno %d\n", d->name, error); + g_topology_lock(); + g_access(cp, 0, -1, 0); + g_topology_unlock(); + continue; } - - /* This drive is now up for sure. */ - gv_set_drive_state(d, GV_DRIVE_UP, 0); - - /* - * If there are subdisks on this drive, we need to create - * providers for them. 
- */ - if (d->sdcount) - gv_drive_modify(d); - - return (gp); - - } while (0); - - g_topology_lock(); - g_access(cp, -1, 0, 0); - - g_detach(cp); - g_destroy_consumer(cp); - g_destroy_geom(gp); - return (NULL); -} - -/* - * Modify the providers for the given drive 'd'. It is assumed that the - * subdisk list of 'd' is already correctly set up. - */ -void -gv_drive_modify(struct gv_drive *d) -{ - struct g_geom *gp; - struct g_consumer *cp; - struct g_provider *pp, *pp2; - struct gv_sd *s; - - KASSERT(d != NULL, ("gv_drive_modify: null d")); - gp = d->geom; - KASSERT(gp != NULL, ("gv_drive_modify: null gp")); - cp = LIST_FIRST(&gp->consumer); - KASSERT(cp != NULL, ("gv_drive_modify: null cp")); - pp = cp->provider; - KASSERT(pp != NULL, ("gv_drive_modify: null pp")); - - g_topology_assert(); - - LIST_FOREACH(s, &d->subdisks, from_drive) { - /* This subdisk already has a provider. */ - if (s->provider != NULL) + /* First config copy. */ + error = g_write_data(cp, GV_CFG_OFFSET, sbuf_data(sb), + GV_CFG_LEN); + if (error) { + printf("VINUM: writing first config copy failed on " + "drive %s, errno %d\n", d->name, error); + g_topology_lock(); + g_access(cp, 0, -1, 0); + g_topology_unlock(); continue; - pp2 = g_new_providerf(gp, "gvinum/sd/%s", s->name); - pp2->mediasize = s->size; - pp2->sectorsize = pp->sectorsize; - g_error_provider(pp2, 0); - s->provider = pp2; - pp2->private = s; - } -} - -static void -gv_drive_dead(void *arg, int flag) -{ - struct g_geom *gp; - struct g_consumer *cp; - struct gv_drive *d; - struct gv_sd *s; - - g_topology_assert(); - KASSERT(arg != NULL, ("gv_drive_dead: NULL arg")); - - if (flag == EV_CANCEL) - return; - - d = arg; - if (d->state != GV_DRIVE_DOWN) - return; - - g_trace(G_T_TOPOLOGY, "gv_drive_dead(%s)", d->name); - - gp = d->geom; - if (gp == NULL) - return; - - LIST_FOREACH(cp, &gp->consumer, consumer) { - if (cp->nstart != cp->nend) { - printf("GEOM_VINUM: dead drive '%s' has still " - "active requests, can't detach consumer\n", - 
d->name); - g_post_event(gv_drive_dead, d, M_WAITOK, d, - NULL); - return; } - if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0) - g_access(cp, -cp->acr, -cp->acw, -cp->ace); - } + /* Second config copy. */ + error = g_write_data(cp, GV_CFG_OFFSET + GV_CFG_LEN, + sbuf_data(sb), GV_CFG_LEN); + if (error) + printf("VINUM: writing second config copy failed on " + "drive %s, errno %d\n", d->name, error); - printf("GEOM_VINUM: lost drive '%s'\n", d->name); - d->geom = NULL; - LIST_FOREACH(s, &d->subdisks, from_drive) { - s->provider = NULL; - s->consumer = NULL; + g_topology_lock(); + g_access(cp, 0, -1, 0); + g_topology_unlock(); } - gv_kill_drive_thread(d); - gp->softc = NULL; - g_wither_geom(gp, ENXIO); -} -static int -gv_drive_destroy_geom(struct gctl_req *req, struct g_class *mp, - struct g_geom *gp) -{ - struct gv_drive *d; - - g_trace(G_T_TOPOLOGY, "gv_drive_destroy_geom: %s", gp->name); - g_topology_assert(); - - d = gp->softc; - gv_kill_drive_thread(d); - - g_wither_geom(gp, ENXIO); - return (0); + sbuf_delete(sb); + g_free(vhdr); } - -#define VINUMDRIVE_CLASS_NAME "VINUMDRIVE" - -static struct g_class g_vinum_drive_class = { - .name = VINUMDRIVE_CLASS_NAME, - .version = G_VERSION, - .taste = gv_drive_taste, - .destroy_geom = gv_drive_destroy_geom -}; - -DECLARE_GEOM_CLASS(g_vinum_drive_class, g_vinum_drive); Index: sys/geom/vinum/geom_vinum_events.c =================================================================== RCS file: sys/geom/vinum/geom_vinum_events.c diff -N sys/geom/vinum/geom_vinum_events.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/geom/vinum/geom_vinum_events.c 3 Nov 2007 02:40:17 -0000 @@ -0,0 +1,217 @@ +/*- + * Copyright (c) 2007 Lukas Ertl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +void +gv_post_event(struct gv_softc *sc, int event, void *arg1, void *arg2, + intmax_t arg3, intmax_t arg4) +{ + struct gv_event *ev; + + ev = g_malloc(sizeof(*ev), M_WAITOK | M_ZERO); + ev->type = event; + ev->arg1 = arg1; + ev->arg2 = arg2; + ev->arg3 = arg3; + ev->arg4 = arg4; + + mtx_lock(&sc->queue_mtx); + TAILQ_INSERT_TAIL(&sc->equeue, ev, events); + wakeup(sc); + mtx_unlock(&sc->queue_mtx); +} + +void +gv_drive_tasted(struct gv_softc *sc, struct g_provider *pp) +{ + struct g_geom *gp; + struct g_consumer *cp; + struct gv_hdr *hdr; + struct gv_drive *d; + char *buf; + + hdr = NULL; + buf = NULL; + + printf("DEBUG: tasted drive on '%s'\n", pp->name); + + gp = sc->geom; + g_topology_lock(); + cp = g_new_consumer(gp); + if (g_attach(cp, pp) != 0) { + g_destroy_consumer(cp); + g_topology_unlock(); + printf("VINUM: failed to attach to provider on taste event\n"); + return; + } + if (g_access(cp, 1, 0, 0) != 0) { + g_detach(cp); + g_destroy_consumer(cp); + g_topology_unlock(); + printf("VINUM: failed to access consumer on taste event\n"); + return; + } + g_topology_unlock(); + + /* Read header and on-disk configuration. */ + hdr = g_read_data(cp, GV_HDR_OFFSET, pp->sectorsize, NULL); + if (hdr == NULL) { + printf("VINUM: failed to read header during taste\n"); + goto failed; + } + + /* + * Setup the drive before we parse the on-disk configuration, so that + * we already know about the drive then. 
+ */ + d = gv_find_drive(sc, hdr->label.name); + if (d == NULL) { + d = g_malloc(sizeof(*d), M_WAITOK | M_ZERO); + strncpy(d->name, hdr->label.name, GV_MAXDRIVENAME); + strncpy(d->device, pp->name, GV_MAXDRIVENAME); + } else if (d->flags & GV_DRIVE_REFERENCED) { + strncpy(d->device, pp->name, GV_MAXDRIVENAME); + d->flags &= ~GV_DRIVE_REFERENCED; + } else { + printf("DEBUG: drive '%s' is already known\n", d->name); + g_free(hdr); + goto failed; + } + + /* Add the consumer and header to the new drive. */ + d->consumer = cp; + d->hdr = hdr; + gv_create_drive(sc, d); + + buf = g_read_data(cp, GV_CFG_OFFSET, GV_CFG_LEN, NULL); + if (buf == NULL) { + printf("VINUM: failed to read config during taste\n"); + g_free(hdr); + goto failed; + } + gv_parse_config(sc, buf, d); + g_free(buf); + + g_topology_lock(); + g_access(cp, -1, 0, 0); + g_topology_unlock(); + + gv_setup_objects(sc); + gv_set_drive_state(d, GV_DRIVE_UP, 0); + + /* XXX continue here! */ + + return; + +failed: + g_topology_lock(); + g_access(cp, -1, 0, 0); + g_detach(cp); + g_destroy_consumer(cp); + g_topology_unlock(); +} + +/* + * When losing a drive (e.g. hardware failure), we cut down the consumer + * attached to the underlying device and bring the drive itself to a + * "referenced" state so that normal tasting could bring it up cleanly if it + * possibly arrives again. 
+ */ +void +gv_drive_lost(struct gv_softc *sc, struct gv_drive *d) +{ + struct g_consumer *cp; + struct gv_drive *d2; + struct gv_sd *s, *s2; + struct gv_freelist *fl, *fl2; + + gv_set_drive_state(d, GV_DRIVE_DOWN, + GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); + + cp = d->consumer; + + if (cp != NULL) { + if (cp->nstart != cp->nend) { + printf("VINUM: dead drive '%s' has still active " + "requests, can't detach consumer\n", d->name); + gv_post_event(sc, GV_EVENT_DRIVE_LOST, d, NULL, 0, 0); + return; + } + g_topology_lock(); + if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0) + g_access(cp, -cp->acr, -cp->acw, -cp->ace); + g_detach(cp); + g_destroy_consumer(cp); + g_topology_unlock(); + } + + LIST_FOREACH_SAFE(fl, &d->freelist, freelist, fl2) { + LIST_REMOVE(fl, freelist); + g_free(fl); + } + + d->consumer = NULL; + g_free(d->hdr); + d->hdr = NULL; + d->flags |= GV_DRIVE_REFERENCED; + snprintf(d->device, GV_MAXDRIVENAME, "???"); + d->size = 0; + d->avail = 0; + d->freelist_entries = 0; + d->sdcount = 0; + + /* Put the subdisk in tasted mode, and remove from drive list. */ + LIST_FOREACH_SAFE(s, &d->subdisks, from_drive, s2) { + LIST_REMOVE(s, from_drive); + s->flags |= GV_SD_TASTED; + } + + /* + * Don't forget that gv_is_newer wants a "real" drive at the beginning + * of the list, so, just to be safe, we shuffle around. 
+ */ + LIST_REMOVE(d, drive); + d2 = LIST_FIRST(&sc->drives); + if (d2 == NULL) + LIST_INSERT_HEAD(&sc->drives, d, drive); + else + LIST_INSERT_AFTER(d2, d, drive); + gv_save_config(sc); +} Index: sys/geom/vinum/geom_vinum_init.c =================================================================== RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum_init.c,v retrieving revision 1.11 diff -u -u -r1.11 geom_vinum_init.c --- sys/geom/vinum/geom_vinum_init.c 28 Aug 2005 18:16:31 -0000 1.11 +++ sys/geom/vinum/geom_vinum_init.c 3 Nov 2007 02:40:17 -0000 @@ -1,5 +1,6 @@ /*- - * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 2004, 2007 Lukas Ertl + * Copyright (c) 2007 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -26,158 +27,19 @@ #include __FBSDID("$FreeBSD: src/sys/geom/vinum/geom_vinum_init.c,v 1.11 2005/08/28 18:16:31 le Exp $"); - #include #include -#include -#include #include #include -#include #include #include #include -#include -static int gv_init_plex(struct gv_plex *); -void gv_init_td(void *); -static int gv_rebuild_plex(struct gv_plex *); -void gv_rebuild_td(void *); -static int gv_start_plex(struct gv_plex *); -static int gv_start_vol(struct gv_volume *); static int gv_sync(struct gv_volume *); -void gv_sync_td(void *); - -struct gv_sync_args { - struct gv_volume *v; - struct gv_plex *from; - struct gv_plex *to; - off_t syncsize; -}; - -void -gv_parityop(struct g_geom *gp, struct gctl_req *req) -{ - struct gv_softc *sc; - struct gv_plex *p; - struct bio *bp; - struct g_consumer *cp; - int error, *flags, type, *rebuild, rv; - char *plex; - - rv = -1; - - plex = gctl_get_param(req, "plex", NULL); - if (plex == NULL) { - gctl_error(req, "no plex given"); - goto out; - } - - flags = gctl_get_paraml(req, "flags", sizeof(*flags)); - if (flags == NULL) { - gctl_error(req, "no flags given"); - goto out; - } - - rebuild = gctl_get_paraml(req, "rebuild", sizeof(*rebuild)); - if (rebuild == NULL) { - 
gctl_error(req, "no rebuild op given"); - goto out; - } - - sc = gp->softc; - type = gv_object_type(sc, plex); - switch (type) { - case GV_TYPE_PLEX: - break; - case GV_TYPE_VOL: - case GV_TYPE_SD: - case GV_TYPE_DRIVE: - default: - gctl_error(req, "'%s' is not a plex", plex); - goto out; - } - - p = gv_find_plex(sc, plex); - if (p->state != GV_PLEX_UP) { - gctl_error(req, "plex %s is not completely accessible", - p->name); - goto out; - } - if (p->org != GV_PLEX_RAID5) { - gctl_error(req, "plex %s is not a RAID5 plex", p->name); - goto out; - } - - cp = p->consumer; - error = g_access(cp, 1, 1, 0); - if (error) { - gctl_error(req, "cannot access consumer"); - goto out; - } - g_topology_unlock(); - - /* Reset the check pointer when using -f. */ - if (*flags & GV_FLAG_F) - p->synced = 0; - - bp = g_new_bio(); - if (bp == NULL) { - gctl_error(req, "cannot create BIO - out of memory"); - g_topology_lock(); - error = g_access(cp, -1, -1, 0); - goto out; - } - bp->bio_cmd = BIO_WRITE; - bp->bio_done = NULL; - bp->bio_data = g_malloc(p->stripesize, M_WAITOK | M_ZERO); - bp->bio_cflags |= GV_BIO_CHECK; - if (*rebuild) - bp->bio_cflags |= GV_BIO_PARITY; - bp->bio_offset = p->synced; - bp->bio_length = p->stripesize; - - /* Schedule it down ... */ - g_io_request(bp, cp); - - /* ... and wait for the result. */ - error = biowait(bp, "gwrite"); - g_free(bp->bio_data); - g_destroy_bio(bp); - - if (error) { - /* Incorrect parity. */ - if (error == EAGAIN) - rv = 1; - - /* Some other error happened. */ - else - gctl_error(req, "Parity check failed at offset 0x%jx, " - "errno %d", (intmax_t)p->synced, error); - - /* Correct parity. */ - } else - rv = 0; - - gctl_set_param(req, "offset", &p->synced, sizeof(p->synced)); - - /* Advance the checkpointer if there was no error. */ - if (rv == 0) - p->synced += p->stripesize; - - /* End of plex; reset the check pointer and signal it to the caller. 
*/ - if (p->synced >= p->size) { - p->synced = 0; - rv = -2; - } - - g_topology_lock(); - error = g_access(cp, -1, -1, 0); - -out: - gctl_set_param(req, "rv", &rv, sizeof(rv)); -} +static int gv_rebuild_plex(struct gv_plex *); +static int gv_init_plex(struct gv_plex *); +static int gv_grow_plex(struct gv_plex *); void gv_start_obj(struct g_geom *gp, struct gctl_req *req) @@ -187,7 +49,7 @@ struct gv_plex *p; int *argc, *initsize; char *argv, buf[20]; - int err, i, type; + int i, type; argc = gctl_get_paraml(req, "argc", sizeof(*argc)); initsize = gctl_get_paraml(req, "initsize", sizeof(*initsize)); @@ -208,32 +70,16 @@ switch (type) { case GV_TYPE_VOL: v = gv_find_vol(sc, argv); - err = gv_start_vol(v); - if (err) { - if (err == EINPROGRESS) { - gctl_error(req, "cannot start volume " - "'%s': already in progress", argv); - } else { - gctl_error(req, "cannot start volume " - "'%s'; errno: %d", argv, err); - } - return; - } + if (v != NULL) + gv_post_event(sc, GV_EVENT_START_VOLUME, v, + NULL, *initsize, 0); break; case GV_TYPE_PLEX: p = gv_find_plex(sc, argv); - err = gv_start_plex(p); - if (err) { - if (err == EINPROGRESS) { - gctl_error(req, "cannot start plex " - "'%s': already in progress", argv); - } else { - gctl_error(req, "cannot start plex " - "'%s'; errno: %d", argv, err); - } - return; - } + if (p != NULL) + gv_post_event(sc, GV_EVENT_START_PLEX, p, NULL, + *initsize, 0); break; case GV_TYPE_SD: @@ -249,36 +95,53 @@ } } -static int +int gv_start_plex(struct gv_plex *p) { struct gv_volume *v; - int error; + struct gv_sd *s; + int error, grow; KASSERT(p != NULL, ("gv_start_plex: NULL p")); - if (p->state == GV_PLEX_UP) - return (0); +/* if (p->state == GV_PLEX_UP) + return (0);*/ error = 0; v = p->vol_sc; - if ((v != NULL) && (v->plexcount > 1)) - error = gv_sync(v); - else if (p->org == GV_PLEX_RAID5) { - if (p->state == GV_PLEX_DEGRADED) +/* if ((v != NULL) && (v->plexcount > 1)) + error = gv_sync(v);*/ + if (p->org == GV_PLEX_STRIPED) { + grow = 0; + 
LIST_FOREACH(s, &p->subdisks, in_plex) { + if (s->flags & GV_SD_GROW) { + grow = 1; + break; + } + } + if (grow) + error = gv_grow_plex(p); + } else if (p->org == GV_PLEX_RAID5) { + if (p->state > GV_PLEX_DEGRADED) { + LIST_FOREACH(s, &p->subdisks, in_plex) { + if (s->flags & GV_SD_GROW) { + error = gv_grow_plex(p); + return (error); + } + } + } else if (p->state == GV_PLEX_DEGRADED) { error = gv_rebuild_plex(p); - else + } else error = gv_init_plex(p); } return (error); } -static int +int gv_start_vol(struct gv_volume *v) { struct gv_plex *p; - struct gv_sd *s; int error; KASSERT(v != NULL, ("gv_start_vol: NULL v")); @@ -291,23 +154,7 @@ else if (v->plexcount == 1) { p = LIST_FIRST(&v->plexes); KASSERT(p != NULL, ("gv_start_vol: NULL p on %s", v->name)); - if (p->org == GV_PLEX_RAID5) { - switch (p->state) { - case GV_PLEX_DOWN: - error = gv_init_plex(p); - break; - case GV_PLEX_DEGRADED: - error = gv_rebuild_plex(p); - break; - default: - return (0); - } - } else { - LIST_FOREACH(s, &p->subdisks, in_plex) { - gv_set_sd_state(s, GV_SD_UP, - GV_SETSTATE_CONFIG); - } - } + error = gv_start_plex(p); } else error = gv_sync(v); @@ -319,7 +166,7 @@ { struct gv_softc *sc; struct gv_plex *p, *up; - struct gv_sync_args *sync; + int error; KASSERT(v != NULL, ("gv_sync: NULL v")); sc = v->vinumconf; @@ -336,20 +183,34 @@ if (up == NULL) return (ENXIO); + g_topology_lock(); + error = gv_access(v->provider, 1, 1, 0); + if (error) { + g_topology_unlock(); + printf("VINUM: sync from '%s' failed to access volume: %d\n", + up->name, error); + return (error); + } + g_topology_unlock(); + + /* Go through the good plex, and issue BIO's to all other plexes. */ LIST_FOREACH(p, &v->plexes, in_volume) { if ((p == up) || (p->state == GV_PLEX_UP)) continue; + /* XXX: Should we check if rebuilding too? 
*/ if (p->flags & GV_PLEX_SYNCING) { return (EINPROGRESS); } + p->synced = 0; p->flags |= GV_PLEX_SYNCING; - sync = g_malloc(sizeof(*sync), M_WAITOK | M_ZERO); - sync->v = v; - sync->from = up; - sync->to = p; - sync->syncsize = GV_DFLT_SYNCSIZE; - kthread_create(gv_sync_td, sync, NULL, 0, 0, "gv_sync '%s'", - p->name); + printf("VINUM: starting sync of plex %s\n", p->name); + error = gv_sync_request(up, p, p->synced, + MIN(GV_DFLT_SYNCSIZE, up->size - p->synced), + BIO_READ, NULL); + if (error) { + printf("VINUM: error syncing plex %s\n", p->name); + break; + } } return (0); @@ -358,314 +219,129 @@ static int gv_rebuild_plex(struct gv_plex *p) { - struct gv_sync_args *sync; - - if (gv_is_open(p->geom)) - return (EBUSY); - - if (p->flags & GV_PLEX_SYNCING) - return (EINPROGRESS); - p->flags |= GV_PLEX_SYNCING; - - sync = g_malloc(sizeof(*sync), M_WAITOK | M_ZERO); - sync->to = p; - sync->syncsize = GV_DFLT_SYNCSIZE; - - kthread_create(gv_rebuild_td, sync, NULL, 0, 0, "gv_rebuild %s", - p->name); - - return (0); -} - -static int -gv_init_plex(struct gv_plex *p) -{ + struct gv_drive *d; struct gv_sd *s; + int error; - KASSERT(p != NULL, ("gv_init_plex: NULL p")); - +/* XXX: Is this safe? (Allows for mounted rebuild)*/ +/* if (gv_provider_is_open(p->vol_sc->provider)) + return (EBUSY);*/ + + if (p->flags & GV_PLEX_SYNCING || + p->flags & GV_PLEX_REBUILDING || + p->flags & GV_PLEX_GROWING) + return (EINPROGRESS); + /* + * Make sure that all subdisks have consumers. We won't allow a rebuild + * unless every subdisk have one. 
+ */ LIST_FOREACH(s, &p->subdisks, in_plex) { - if (s->state == GV_SD_INITIALIZING) - return (EINPROGRESS); - gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); - s->init_size = GV_DFLT_SYNCSIZE; - kthread_create(gv_init_td, s, NULL, 0, 0, "gv_init %s", - s->name); + d = s->drive_sc; + if (d == NULL || (d->flags & GV_DRIVE_REFERENCED)) { + printf("VINUM: can't rebuild %s, subdisk(s) have no " + "drives\n", p->name); + return (ENXIO); + } } - - return (0); -} - -/* This thread is responsible for rebuilding a degraded RAID5 plex. */ -void -gv_rebuild_td(void *arg) -{ - struct bio *bp; - struct gv_plex *p; - struct g_consumer *cp; - struct gv_sync_args *sync; - u_char *buf; - off_t i; - int error; - - buf = NULL; - bp = NULL; - - sync = arg; - p = sync->to; + p->flags |= GV_PLEX_REBUILDING; p->synced = 0; - cp = p->consumer; + g_topology_assert_not(); g_topology_lock(); - error = g_access(cp, 1, 1, 0); + error = gv_access(p->vol_sc->provider, 1, 1, 0); if (error) { - g_topology_unlock(); - printf("GEOM_VINUM: rebuild of %s failed to access consumer: " - "%d\n", p->name, error); - kthread_exit(error); - } - g_topology_unlock(); - - buf = g_malloc(sync->syncsize, M_WAITOK); - - printf("GEOM_VINUM: rebuild of %s started\n", p->name); - i = 0; - for (i = 0; i < p->size; i += (p->stripesize * (p->sdcount - 1))) { -/* - if (i + sync->syncsize > p->size) - sync->syncsize = p->size - i; -*/ - bp = g_new_bio(); - if (bp == NULL) { - printf("GEOM_VINUM: rebuild of %s failed creating bio: " - "out of memory\n", p->name); - break; - } - bp->bio_cmd = BIO_WRITE; - bp->bio_done = NULL; - bp->bio_data = buf; - bp->bio_cflags |= GV_BIO_REBUILD; - bp->bio_offset = i; - bp->bio_length = p->stripesize; - - /* Schedule it down ... */ - g_io_request(bp, cp); - - /* ... and wait for the result. 
*/ - error = biowait(bp, "gwrite"); - if (error) { - printf("GEOM_VINUM: rebuild of %s failed at offset %jd " - "errno: %d\n", p->name, i, error); - break; - } - g_destroy_bio(bp); - bp = NULL; + printf("VINUM: unable to access provider\n"); + return (0); } - - if (bp != NULL) - g_destroy_bio(bp); - if (buf != NULL) - g_free(buf); - - g_topology_lock(); - g_access(cp, -1, -1, 0); - gv_save_config_all(p->vinumconf); g_topology_unlock(); - p->flags &= ~GV_PLEX_SYNCING; - p->synced = 0; - - /* Successful initialization. */ - if (!error) - printf("GEOM_VINUM: rebuild of %s finished\n", p->name); - - g_free(sync); - kthread_exit(error); + gv_parity_request(p, GV_BIO_REBUILD, 0); + return (0); } -void -gv_sync_td(void *arg) +static int +gv_grow_plex(struct gv_plex *p) { - struct bio *bp; - struct gv_plex *p; - struct g_consumer *from, *to; - struct gv_sync_args *sync; - u_char *buf; - off_t i; - int error; - - sync = arg; - - from = sync->from->consumer; - to = sync->to->consumer; - - p = sync->to; - p->synced = 0; + struct gv_volume *v; + struct gv_sd *s; + off_t origsize, origlength; + int error, sdcount; - error = 0; + KASSERT(p != NULL, ("gv_grow_plex: NULL p")); + v = p->vol_sc; + KASSERT(v != NULL, ("gv_grow_plex: NULL v")); + if (p->flags & GV_PLEX_GROWING || + p->flags & GV_PLEX_SYNCING || + p->flags & GV_PLEX_REBUILDING) + return (EINPROGRESS); g_topology_lock(); - error = g_access(from, 1, 0, 0); - if (error) { - g_topology_unlock(); - printf("GEOM_VINUM: sync from '%s' failed to access " - "consumer: %d\n", sync->from->name, error); - g_free(sync); - kthread_exit(error); - } - error = g_access(to, 0, 1, 0); + error = gv_access(v->provider, 1, 1, 0); + g_topology_unlock(); if (error) { - g_access(from, -1, 0, 0); - g_topology_unlock(); - printf("GEOM_VINUM: sync to '%s' failed to access " - "consumer: %d\n", p->name, error); - g_free(sync); - kthread_exit(error); + printf("VINUM: unable to access provider\n"); + return (GV_ERR_ISBUSY); /*XXX: wrong errorcode */ 
} - g_topology_unlock(); - - printf("GEOM_VINUM: plex sync %s -> %s started\n", sync->from->name, - sync->to->name); - for (i = 0; i < p->size; i+= sync->syncsize) { - /* Read some bits from the good plex. */ - buf = g_read_data(from, i, sync->syncsize, &error); - if (buf == NULL) { - printf("GEOM_VINUM: sync read from '%s' failed at " - "offset %jd; errno: %d\n", sync->from->name, i, - error); - break; - } - - /* - * Create a bio and schedule it down on the 'bad' plex. We - * cannot simply use g_write_data() because we have to let the - * lower parts know that we are an initialization process and - * not a 'normal' request. - */ - bp = g_new_bio(); - if (bp == NULL) { - printf("GEOM_VINUM: sync write to '%s' failed at " - "offset %jd; out of memory\n", p->name, i); - g_free(buf); - break; - } - bp->bio_cmd = BIO_WRITE; - bp->bio_offset = i; - bp->bio_length = sync->syncsize; - bp->bio_data = buf; - bp->bio_done = NULL; - - /* - * This hack declare this bio as part of an initialization - * process, so that the lower levels allow it to get through. - */ - bp->bio_cflags |= GV_BIO_SYNCREQ; - /* Schedule it down ... */ - g_io_request(bp, to); - - /* ... and wait for the result. */ - error = biowait(bp, "gwrite"); - g_destroy_bio(bp); - g_free(buf); - if (error) { - printf("GEOM_VINUM: sync write to '%s' failed at " - "offset %jd; errno: %d\n", p->name, i, error); - break; - } - - /* Note that we have synced a little bit more. */ - p->synced += sync->syncsize; + /* XXX: This routine with finding origsize is used two other places as + * well, so we should create a function for it. */ + sdcount = p->sdcount; + LIST_FOREACH(s, &p->subdisks, in_plex) { + if (s->flags & GV_SD_GROW) + sdcount--; } - - g_topology_lock(); - g_access(from, -1, 0, 0); - g_access(to, 0, -1, 0); - gv_save_config_all(p->vinumconf); - g_topology_unlock(); - - /* Successful initialization. 
*/ - if (!error) - printf("GEOM_VINUM: plex sync %s -> %s finished\n", - sync->from->name, sync->to->name); - - p->flags &= ~GV_PLEX_SYNCING; + s = LIST_FIRST(&p->subdisks); + if (s == NULL) { + printf("VINUM: error growing plex without subdisks"); + return (GV_ERR_NOTFOUND); + } + p->flags |= GV_PLEX_GROWING; + origsize = (sdcount - 1) * s->size; + origlength = (sdcount - 1) * p->stripesize; p->synced = 0; + printf("VINUM: starting growing of plex %s\n", p->name); + gv_grow_request(p, 0, MIN(origlength, origsize), BIO_READ, NULL); - g_free(sync); - kthread_exit(error); + return (0); } -void -gv_init_td(void *arg) +static int +gv_init_plex(struct gv_plex *p) { - struct gv_sd *s; struct gv_drive *d; - struct g_geom *gp; - struct g_consumer *cp; + struct gv_sd *s; int error; - off_t i, init_size, start, offset, length; - u_char *buf; - - s = arg; - KASSERT(s != NULL, ("gv_init_td: NULL s")); - d = s->drive_sc; - KASSERT(d != NULL, ("gv_init_td: NULL d")); - gp = d->geom; - KASSERT(gp != NULL, ("gv_init_td: NULL gp")); - - cp = LIST_FIRST(&gp->consumer); - KASSERT(cp != NULL, ("gv_init_td: NULL cp")); - - s->init_error = 0; - init_size = s->init_size; - start = s->drive_offset + s->initialized; - offset = s->drive_offset; - length = s->size; + off_t start; + caddr_t data; - buf = g_malloc(s->init_size, M_WAITOK | M_ZERO); - - g_topology_lock(); - error = g_access(cp, 0, 1, 0); - if (error) { - s->init_error = error; - g_topology_unlock(); - printf("GEOM_VINUM: subdisk '%s' init: failed to access " - "consumer; error: %d\n", s->name, error); - kthread_exit(error); - } - g_topology_unlock(); + KASSERT(p != NULL, ("gv_init_plex: NULL p")); - for (i = start; i < offset + length; i += init_size) { - error = g_write_data(cp, i, buf, init_size); - if (error) { - printf("GEOM_VINUM: subdisk '%s' init: write failed" - " at offset %jd (drive offset %jd); error %d\n", - s->name, (intmax_t)s->initialized, (intmax_t)i, - error); + LIST_FOREACH(s, &p->subdisks, in_plex) { + if 
(s->state == GV_SD_INITIALIZING) + return (EINPROGRESS); + gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); + s->init_size = GV_DFLT_SYNCSIZE; + start = s->drive_offset + s->initialized; + d = s->drive_sc; + if (d == NULL) { + printf("VINUM: subdisk %s has no drive yet\n", s->name); break; } - s->initialized += init_size; - } - - g_free(buf); - - g_topology_lock(); - g_access(cp, 0, -1, 0); - g_topology_unlock(); - if (error) { - s->init_error = error; - g_topology_lock(); - gv_set_sd_state(s, GV_SD_STALE, - GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); - g_topology_unlock(); - } else { + /* + * Take the lock here since we need to avoid a race in + * gv_init_request if the BIO is completed before the lock is + * released. + */ g_topology_lock(); - gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_CONFIG); + error = g_access(d->consumer, 0, 1, 0); g_topology_unlock(); - s->initialized = 0; - printf("GEOM_VINUM: subdisk '%s' init: finished successfully\n", - s->name); + if (error) { + printf("VINUM: error accessing consumer when " + "initializing %s\n", s->name); + break; /* XXX: Or continue..? */ + } + data = g_malloc(s->init_size, M_WAITOK | M_ZERO); + gv_init_request(s, start, data, s->init_size); } - kthread_exit(error); + return (0); } Index: sys/geom/vinum/geom_vinum_list.c =================================================================== RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum_list.c,v retrieving revision 1.3 diff -u -u -r1.3 geom_vinum_list.c --- sys/geom/vinum/geom_vinum_list.c 6 Jan 2005 18:27:30 -0000 1.3 +++ sys/geom/vinum/geom_vinum_list.c 3 Nov 2007 02:40:17 -0000 @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -28,7 +28,6 @@ #include __FBSDID("$FreeBSD: src/sys/geom/vinum/geom_vinum_list.c,v 1.3 2005/01/06 18:27:30 imp Exp $"); -#include #include #include @@ -294,19 +293,36 @@ sbuf_printf(sb, "Plex %s:\tSize:\t%9jd bytes (%jd MB)\n", p->name, (intmax_t)p->size, (intmax_t)p->size / MEGABYTE); sbuf_printf(sb, "\t\tSubdisks: %8d\n", p->sdcount); - sbuf_printf(sb, "\t\tState: %s\n\t\tOrganization: %s", - gv_plexstate(p->state), gv_plexorg(p->org)); + sbuf_printf(sb, "\t\tState: %s\n", gv_plexstate(p->state)); + if ((p->flags & GV_PLEX_SYNCING) || + (p->flags & GV_PLEX_GROWING) || + (p->flags & GV_PLEX_REBUILDING)) { + sbuf_printf(sb, "\t\tSynced: "); + sbuf_printf(sb, "%16jd bytes (%d%%)\n", + (intmax_t)p->synced, + (int)((p->synced * 100) / p->size)); + } + sbuf_printf(sb, "\t\tOrganization: %s", gv_plexorg(p->org)); if (gv_is_striped(p)) { sbuf_printf(sb, "\tStripe size: %s\n", gv_roughlength(p->stripesize, 1)); } + sbuf_printf(sb, "\t\tFlags: %d\n", p->flags); if (p->vol_sc != NULL) { sbuf_printf(sb, "\t\tPart of volume %s\n", p->volume); } } else { - sbuf_printf(sb, "P %-18s %2s State: %s\tSubdisks: %5d" - "\tSize: %s\n", p->name, gv_plexorg_short(p->org), - gv_plexstate(p->state), p->sdcount, + sbuf_printf(sb, "P %-18s %2s State: ", p->name, + gv_plexorg_short(p->org)); + if ((p->flags & GV_PLEX_SYNCING) || + (p->flags & GV_PLEX_GROWING) || + (p->flags & GV_PLEX_REBUILDING)) { + sbuf_printf(sb, "S %d%%\t", (int)((p->synced * 100) / + p->size)); + } else { + sbuf_printf(sb, "%s\t", gv_plexstate(p->state)); + } + sbuf_printf(sb, "Subdisks: %5d\tSize: %s\n", p->sdcount, gv_roughlength(p->size, 0)); } @@ -388,6 +404,7 @@ s->drive_sc == NULL ? 
"*missing*" : s->drive_sc->name, (intmax_t)s->drive_offset, gv_roughlength(s->drive_offset, 1)); + sbuf_printf(sb, "\t\tFlags: %d\n", s->flags); } else { sbuf_printf(sb, "S %-21s State: ", s->name); if (s->state == GV_SD_INITIALIZING || @@ -447,6 +464,7 @@ sbuf_printf(sb, "\t\tAvailable: %11jd bytes (%jd MB)\n", (intmax_t)d->avail, (intmax_t)d->avail / MEGABYTE); sbuf_printf(sb, "\t\tState: %s\n", gv_drivestate(d->state)); + sbuf_printf(sb, "\t\tFlags: %d\n", d->flags); /* Be very verbose. */ if (flags & GV_FLAG_VV) { @@ -461,7 +479,7 @@ sbuf_printf(sb, "D %-21s State: %s\t/dev/%s\tA: %jd/%jd MB " "(%d%%)\n", d->name, gv_drivestate(d->state), d->device, (intmax_t)d->avail / MEGABYTE, (intmax_t)d->size / MEGABYTE, - (int)((d->avail * 100) / d->size)); + d->size > 0 ? (int)((d->avail * 100) / d->size) : 0); } /* Recursive listing. */ Index: sys/geom/vinum/geom_vinum_move.c =================================================================== RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum_move.c,v retrieving revision 1.3 diff -u -u -r1.3 geom_vinum_move.c --- sys/geom/vinum/geom_vinum_move.c 8 Feb 2006 21:32:45 -0000 1.3 +++ sys/geom/vinum/geom_vinum_move.c 3 Nov 2007 02:40:17 -0000 @@ -32,26 +32,21 @@ #include __FBSDID("$FreeBSD: src/sys/geom/vinum/geom_vinum_move.c,v 1.3 2006/02/08 21:32:45 le Exp $"); -#include #include -#include #include #include #include #include -#include - -static int gv_move_sd(struct gv_softc *, struct gctl_req *, - struct gv_sd *, char *, int); void gv_move(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_sd *s; + struct gv_drive *d; char buf[20], *destination, *object; - int *argc, err, *flags, i, type; + int *argc, *flags, i, type; sc = gp->softc; @@ -66,6 +61,7 @@ gctl_error(req, "destination '%s' is not a drive", destination); return; } + d = gv_find_drive(sc, destination); /* * We start with 1 here, because argv[0] on the command line is the @@ -89,68 +85,60 @@ gctl_error(req, "unknown subdisk '%s'", object); 
return; } - err = gv_move_sd(sc, req, s, destination, *flags); - if (err) - return; + gv_post_event(sc, GV_EVENT_MOVE_SD, s, d, *flags, 0); } - - gv_save_config_all(sc); } /* Move a subdisk. */ -static int -gv_move_sd(struct gv_softc *sc, struct gctl_req *req, struct gv_sd *cursd, char *destination, int flags) +int +gv_move_sd(struct gv_softc *sc, struct gv_sd *cursd, + struct gv_drive *destination, int flags) { struct gv_drive *d; struct gv_sd *newsd, *s, *s2; struct gv_plex *p; - struct g_consumer *cp; - char errstr[ERRBUFSIZ]; int err; g_topology_assert(); KASSERT(cursd != NULL, ("gv_move_sd: NULL cursd")); + KASSERT(destination != NULL, ("gv_move_sd: NULL destination")); - cp = cursd->consumer; + d = cursd->drive_sc; - if (cp != NULL && (cp->acr || cp->acw || cp->ace)) { - gctl_error(req, "subdisk '%s' is busy", cursd->name); - return (-1); + /* XXX: Can't do a move without unmounting. Perhaps okay. */ + if (gv_consumer_is_open(d->consumer) || + gv_consumer_is_open(destination->consumer)) { + printf("VINUM: consumers on current and destination drive " + " still open\n"); + return (GV_ERR_ISBUSY); } if (!(flags && GV_FLAG_F)) { - gctl_error(req, "-f flag not passed; move would be " - "destructive"); - return (-1); - } - - d = gv_find_drive(sc, destination); - if (d == NULL) { - gctl_error(req, "destination drive '%s' not found", - destination); - return (-1); + printf("VINUM: -f flag not passed; move would be " + "destructive\n"); + return (GV_ERR_INVFLAG); } - if (d == cursd->drive_sc) { - gctl_error(req, "subdisk '%s' already on drive '%s'", - cursd->name, destination); - return (-1); + if (destination == cursd->drive_sc) { + printf("VINUM: subdisk '%s' already on drive '%s'\n", + cursd->name, destination->name); + return (GV_ERR_ISATTACHED); } /* XXX: Does it have to be part of a plex? 
*/ p = gv_find_plex(sc, cursd->plex); if (p == NULL) { - gctl_error(req, "subdisk '%s' is not part of a plex", + printf("VINUM: subdisk '%s' is not part of a plex\n", cursd->name); - return (-1); + return (GV_ERR_NOTFOUND); } - + /* Stale the old subdisk. */ err = gv_set_sd_state(cursd, GV_SD_STALE, GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); if (err) { - gctl_error(req, "could not set the subdisk '%s' to state " - "'stale'", cursd->name); + printf("VINUM: could not set the subdisk '%s' to state " + "'stale'\n", cursd->name); return (err); } @@ -163,54 +151,27 @@ newsd->plex_offset = cursd->plex_offset; newsd->size = cursd->size; newsd->drive_offset = -1; - strncpy(newsd->name, cursd->name, GV_MAXSDNAME); - strncpy(newsd->drive, destination, GV_MAXDRIVENAME); - strncpy(newsd->plex, cursd->plex, GV_MAXPLEXNAME); + strlcpy(newsd->name, cursd->name, GV_MAXSDNAME); + strlcpy(newsd->drive, destination->name, GV_MAXDRIVENAME); + strlcpy(newsd->plex, cursd->plex, GV_MAXPLEXNAME); newsd->state = GV_SD_STALE; newsd->vinumconf = cursd->vinumconf; - err = gv_sd_to_drive(sc, d, newsd, errstr, ERRBUFSIZ); + err = gv_sd_to_drive(newsd, destination); if (err) { /* XXX not enough free space? */ - gctl_error(req, errstr); g_free(newsd); return (err); } /* Replace the old sd by the new one. */ - if (cp != NULL) - g_detach(cp); LIST_FOREACH_SAFE(s, &p->subdisks, in_plex, s2) { if (s == cursd) { - p->sdcount--; - p->size -= s->size; - err = gv_rm_sd(sc, req, s, 0); - if (err) - return (err); - - } - } - - gv_sd_to_plex(p, newsd, 1); - - /* Creates the new providers.... */ - gv_drive_modify(d); - - /* And reconnect the consumer ... 
*/ - if (cp != NULL) { - newsd->consumer = cp; - err = g_attach(cp, newsd->provider); - if (err) { - g_destroy_consumer(cp); - gctl_error(req, "proposed move would create a loop " - "in GEOM config"); - return (err); + gv_rm_sd(sc, s); } } - + gv_sd_to_plex(newsd, p); LIST_INSERT_HEAD(&sc->subdisks, newsd, sd); - gv_save_config_all(sc); - return (0); } Index: sys/geom/vinum/geom_vinum_plex.c =================================================================== RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum_plex.c,v retrieving revision 1.17 diff -u -u -r1.17 geom_vinum_plex.c --- sys/geom/vinum/geom_vinum_plex.c 6 Jan 2006 18:03:17 -0000 1.17 +++ sys/geom/vinum/geom_vinum_plex.c 3 Nov 2007 02:40:17 -0000 @@ -1,5 +1,6 @@ /*- - * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 2004, 2007 Lukas Ertl + * Copyright (c) 2007 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -29,13 +30,8 @@ #include #include -#include -#include -#include #include #include -#include -#include #include #include @@ -43,329 +39,414 @@ #include #include -static void gv_plex_completed_request(struct gv_plex *, struct bio *); -static void gv_plex_normal_request(struct gv_plex *, struct bio *); -static void gv_plex_worker(void *); -static int gv_check_parity(struct gv_plex *, struct bio *, - struct gv_raid5_packet *); -static int gv_normal_parity(struct gv_plex *, struct bio *, - struct gv_raid5_packet *); - -/* XXX: is this the place to catch dying subdisks? 
*/ -static void -gv_plex_orphan(struct g_consumer *cp) +static int gv_check_parity(struct gv_plex *, struct bio *, + struct gv_raid5_packet *); +static int gv_normal_parity(struct gv_plex *, struct bio *, + struct gv_raid5_packet *); +static void gv_plex_flush(struct gv_plex *); +static int gv_plex_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *, + int *, int); +static int gv_plex_normal_request(struct gv_plex *, struct bio *, off_t, + off_t, caddr_t); +void +gv_plex_start(struct gv_plex *p, struct bio *bp) { - struct g_geom *gp; - struct gv_plex *p; - int error; + struct bio *cbp; + struct gv_sd *s; + struct gv_raid5_packet *wp; + caddr_t addr; + off_t bcount, boff, len; - g_topology_assert(); - gp = cp->geom; - g_trace(G_T_TOPOLOGY, "gv_plex_orphan(%s)", gp->name); - - if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0) - g_access(cp, -cp->acr, -cp->acw, -cp->ace); - error = cp->provider->error; - if (error == 0) - error = ENXIO; - g_detach(cp); - g_destroy_consumer(cp); - if (!LIST_EMPTY(&gp->consumer)) - return; + bcount = bp->bio_length; + addr = bp->bio_data; + boff = bp->bio_offset; - p = gp->softc; - if (p != NULL) { - gv_kill_plex_thread(p); - p->geom = NULL; - p->provider = NULL; - p->consumer = NULL; - } - gp->softc = NULL; - g_wither_geom(gp, error); -} + /* Walk over the whole length of the request, we might split it up. */ + while (bcount > 0) { + wp = NULL; -void -gv_plex_done(struct bio *bp) -{ - struct gv_plex *p; + /* + * RAID5 plexes need special treatment, as a single request + * might involve several read/write sub-requests. 
+ */ + if (p->org == GV_PLEX_RAID5) { + wp = gv_raid5_start(p, bp, addr, boff, bcount); + if (wp == NULL) + return; + + len = wp->length; - p = bp->bio_from->geom->softc; - bp->bio_cflags |= GV_BIO_DONE; - mtx_lock(&p->bqueue_mtx); - bioq_insert_tail(p->bqueue, bp); - wakeup(p); - mtx_unlock(&p->bqueue_mtx); + if (TAILQ_EMPTY(&wp->bits)) + g_free(wp); + else if (wp->lockbase != -1) + TAILQ_INSERT_TAIL(&p->packets, wp, list); + + /* + * Requests to concatenated and striped plexes go straight + * through. + */ + } else { + len = gv_plex_normal_request(p, bp, boff, bcount, addr); + } + if (len < 0) + return; + + bcount -= len; + addr += len; + boff += len; + } + + /* + * Fire off all sub-requests. We get the correct consumer (== drive) + * to send each request to via the subdisk that was stored in + * cbp->bio_caller1. + */ + cbp = bioq_takefirst(p->bqueue); + while (cbp != NULL) { + /* + * RAID5 sub-requests need to come in correct order, otherwise + * we trip over the parity, as it might be overwritten by + * another sub-request. We abuse cbp->bio_caller2 to mark + * potential overlap situations. + */ + if (cbp->bio_caller2 != NULL && gv_stripe_active(p, cbp)) { + /* Park the bio on the waiting queue. */ + cbp->bio_cflags |= GV_BIO_ONHOLD; + bioq_disksort(p->wqueue, cbp); + } else { + s = cbp->bio_caller1; + g_io_request(cbp, s->drive_sc->consumer); + } + cbp = bioq_takefirst(p->bqueue); + } } -/* Find the correct subdisk to send the bio to and build a bio to send. 
*/ static int -gv_plexbuffer(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) +gv_plex_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, + off_t *real_len, int *sdno, int growing) { - struct g_geom *gp; struct gv_sd *s; - struct bio *cbp, *pbp; - int i, sdno; - off_t len_left, real_len, real_off; - off_t stripeend, stripeno, stripestart; + int i, sdcount; + off_t len_left, stripeend, stripeno, stripestart; - if (p == NULL || LIST_EMPTY(&p->subdisks)) - return (ENXIO); - - s = NULL; - gp = bp->bio_to->geom; - - /* - * We only handle concatenated and striped plexes here. RAID5 plexes - * are handled in build_raid5_request(). - */ switch (p->org) { case GV_PLEX_CONCAT: /* * Find the subdisk where this request starts. The subdisks in * this list must be ordered by plex_offset. */ + i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->plex_offset <= boff && - s->plex_offset + s->size > boff) + s->plex_offset + s->size > boff) { + *sdno = i; break; + } + i++; } /* Subdisk not found. */ - if (s == NULL) - return (ENXIO); + if (s == NULL || s->drive_sc == NULL) + return (GV_ERR_NOTFOUND); /* Calculate corresponding offsets on disk. */ - real_off = boff - s->plex_offset; - len_left = s->size - real_off; - real_len = (bcount > len_left) ? len_left : bcount; + *real_off = boff - s->plex_offset; + len_left = s->size - (*real_off); + KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0")); + *real_len = (bcount > len_left) ? len_left : bcount; break; case GV_PLEX_STRIPED: /* The number of the stripe where the request starts. */ stripeno = boff / p->stripesize; + KASSERT(stripeno >= 0, ("gv_plex_offset: stripeno < 0")); - /* The number of the subdisk where the stripe resides. */ - sdno = stripeno % p->sdcount; + /* Take growing subdisks into account when calculating. */ + sdcount = gv_sdcount(p, (boff >= p->synced)); - /* Find the right subdisk. 
*/ - i = 0; - LIST_FOREACH(s, &p->subdisks, in_plex) { - if (i == sdno) - break; - i++; - } + /* Only take p->synced into calculation if we're growing. */ + if (!(boff + bcount <= p->synced) && + (p->flags & GV_PLEX_GROWING) && + !growing) + return (GV_ERR_ISBUSY); + /* The number of the subdisk where the stripe resides. */ + *sdno = stripeno % sdcount; - /* Subdisk not found. */ - if (s == NULL) - return (ENXIO); + KASSERT(sdno >= 0, ("gv_plex_offset: sdno < 0")); /* The offset of the stripe from the start of the subdisk. */ - stripestart = (stripeno / p->sdcount) * + stripestart = (stripeno / sdcount) * p->stripesize; + KASSERT(stripestart >= 0, ("gv_plex_offset: stripestart < 0")); /* The offset at the end of the stripe. */ stripeend = stripestart + p->stripesize; /* The offset of the request on this subdisk. */ - real_off = boff - (stripeno * p->stripesize) + + *real_off = boff - (stripeno * p->stripesize) + stripestart; /* The length left in this stripe. */ - len_left = stripeend - real_off; + len_left = stripeend - *real_off; + KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0")); - real_len = (bcount <= len_left) ? bcount : len_left; + *real_len = (bcount <= len_left) ? bcount : len_left; break; default: - return (EINVAL); + return (GV_ERR_PLEXORG); } + return (0); +} + +/* + * Prepare a normal plex request. + */ +static int +gv_plex_normal_request(struct gv_plex *p, struct bio *bp, off_t boff, + off_t bcount, caddr_t addr) +{ + struct gv_sd *s; + struct bio *cbp; + off_t real_len, real_off; + int i, err, sdno; + + err = ENXIO; + s = NULL; + sdno = -1; + real_len = real_off = 0; + + if (p == NULL || LIST_EMPTY(&p->subdisks)) + goto bad; + + err = gv_plex_offset(p, boff, bcount, &real_off, + &real_len, &sdno, (bp->bio_pflags & GV_BIO_SYNCREQ)); + /* If the request was blocked, put it into wait. */ + if (err == GV_ERR_ISBUSY) { + bioq_disksort(p->rqueue, bp); + return (-1); /* "Fail", and delay request. 
*/ + } + if (err) + goto bad; + + /* Find the right subdisk. */ + i = 0; + LIST_FOREACH(s, &p->subdisks, in_plex) { + if (i == sdno) + break; + i++; + } + + /* Subdisk not found. */ + if (s == NULL || s->drive_sc == NULL) + goto bad; /* Now check if we can handle the request on this subdisk. */ switch (s->state) { case GV_SD_UP: /* If the subdisk is up, just continue. */ break; - case GV_SD_STALE: if (!(bp->bio_cflags & GV_BIO_SYNCREQ)) - return (ENXIO); + goto bad; - printf("GEOM_VINUM: sd %s is initializing\n", s->name); + printf("VINUM: sd %s is initializing\n", s->name); gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); break; - case GV_SD_INITIALIZING: if (bp->bio_cmd == BIO_READ) - return (ENXIO); + goto bad; break; - default: /* All other subdisk states mean it's not accessible. */ - return (ENXIO); + err = EINVAL; + goto bad; } /* Clone the bio and adjust the offsets and sizes. */ cbp = g_clone_bio(bp); - if (cbp == NULL) - return (ENOMEM); - cbp->bio_offset = real_off; + if (cbp == NULL) { + err = ENOMEM; + goto bad; + } + cbp->bio_offset = real_off + s->drive_offset; cbp->bio_length = real_len; cbp->bio_data = addr; - cbp->bio_done = g_std_done; - cbp->bio_caller2 = s->consumer; - if ((bp->bio_cflags & GV_BIO_SYNCREQ)) { + cbp->bio_done = gv_done; + cbp->bio_caller1 = s; + if ((bp->bio_cflags & GV_BIO_SYNCREQ)) cbp->bio_cflags |= GV_BIO_SYNCREQ; - cbp->bio_done = gv_plex_done; - } - if (bp->bio_driver1 == NULL) { - bp->bio_driver1 = cbp; - } else { - pbp = bp->bio_driver1; - while (pbp->bio_caller1 != NULL) - pbp = pbp->bio_caller1; - pbp->bio_caller1 = cbp; - } - - return (0); + /* Store the sub-requests now and let others issue them. */ + bioq_insert_tail(p->bqueue, cbp); + return (real_len); +bad: + /* Building the sub-request failed. 
*/ + printf("VINUM: plex request failed for "); + g_print_bio(bp); + printf("\n"); + g_io_deliver(bp, err); + return (-1); } -static void -gv_plex_start(struct bio *bp) +/* + * Handle a completed request to a striped or concatenated plex. + */ +void +gv_plex_normal_done(struct gv_plex *p, struct bio *bp) { - struct gv_plex *p; - - switch(bp->bio_cmd) { - case BIO_READ: - case BIO_WRITE: - case BIO_DELETE: - break; - case BIO_GETATTR: - default: - g_io_deliver(bp, EOPNOTSUPP); - return; - } + struct bio *pbp; - /* - * We cannot handle this request if too many of our subdisks are - * inaccessible. - */ - p = bp->bio_to->geom->softc; - if ((p->state < GV_PLEX_DEGRADED) && - !(bp->bio_cflags & GV_BIO_SYNCREQ)) { - g_io_deliver(bp, ENXIO); - return; + pbp = bp->bio_parent; + if (pbp->bio_error == 0) + pbp->bio_error = bp->bio_error; + g_destroy_bio(bp); + pbp->bio_inbed++; + if (pbp->bio_children == pbp->bio_inbed) { + /* Just set it to length since multiple plexes will + * screw things up. */ + pbp->bio_completed = pbp->bio_length; + if (pbp->bio_cflags & GV_BIO_SYNCREQ) + gv_sync_complete(p, pbp); + else if (pbp->bio_pflags & GV_BIO_SYNCREQ) + gv_grow_complete(p, pbp); + else + g_io_deliver(pbp, pbp->bio_error); } - - mtx_lock(&p->bqueue_mtx); - bioq_disksort(p->bqueue, bp); - wakeup(p); - mtx_unlock(&p->bqueue_mtx); } -static void -gv_plex_worker(void *arg) +/* + * Handle a completed request to a RAID-5 plex. + */ +void +gv_plex_raid5_done(struct gv_plex *p, struct bio *bp) { - struct bio *bp; - struct gv_plex *p; - struct gv_sd *s; + struct gv_softc *sc; + struct bio *cbp, *pbp; + struct gv_bioq *bq, *bq2; + struct gv_raid5_packet *wp; + off_t completed; + int i; - p = arg; - KASSERT(p != NULL, ("NULL p")); + completed = 0; + sc = p->vinumconf; + wp = bp->bio_caller2; - mtx_lock(&p->bqueue_mtx); - for (;;) { - /* We were signaled to exit. 
*/ - if (p->flags & GV_PLEX_THREAD_DIE) + switch (bp->bio_parent->bio_cmd) { + case BIO_READ: + if (wp == NULL) { + completed = bp->bio_completed; break; + } - /* Take the first BIO from our queue. */ - bp = bioq_takefirst(p->bqueue); - if (bp == NULL) { - msleep(p, &p->bqueue_mtx, PRIBIO, "-", hz/10); - continue; - } - mtx_unlock(&p->bqueue_mtx); - - /* A completed request. */ - if (bp->bio_cflags & GV_BIO_DONE) { - if (bp->bio_cflags & GV_BIO_SYNCREQ || - bp->bio_cflags & GV_BIO_REBUILD) { - s = bp->bio_to->private; - if (bp->bio_error == 0) - s->initialized += bp->bio_length; - if (s->initialized >= s->size) { - g_topology_lock(); - gv_set_sd_state(s, GV_SD_UP, - GV_SETSTATE_CONFIG); - g_topology_unlock(); - s->initialized = 0; + TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { + if (bq->bp != bp) + continue; + TAILQ_REMOVE(&wp->bits, bq, queue); + g_free(bq); + for (i = 0; i < wp->length; i++) + wp->data[i] ^= bp->bio_data[i]; + break; + } + if (TAILQ_EMPTY(&wp->bits)) { + completed = wp->length; + if (wp->lockbase != -1) { + TAILQ_REMOVE(&p->packets, wp, list); + /* Bring the waiting bios back into the game. */ + pbp = bioq_takefirst(p->wqueue); + while (pbp != NULL) { + mtx_lock(&sc->queue_mtx); + bioq_disksort(sc->bqueue, pbp); + mtx_unlock(&sc->queue_mtx); + pbp = bioq_takefirst(p->wqueue); } } + g_free(wp); + } - if (bp->bio_cflags & GV_BIO_SYNCREQ) - g_std_done(bp); - else - gv_plex_completed_request(p, bp); - /* - * A sub-request that was hold back because it interfered with - * another sub-request. - */ - } else if (bp->bio_cflags & GV_BIO_ONHOLD) { - /* Is it still locked out? */ - if (gv_stripe_active(p, bp)) { - /* Park the bio on the waiting queue. */ - mtx_lock(&p->bqueue_mtx); - bioq_disksort(p->wqueue, bp); - mtx_unlock(&p->bqueue_mtx); - } else { - bp->bio_cflags &= ~GV_BIO_ONHOLD; - g_io_request(bp, bp->bio_caller2); - } + break; - /* A normal request to this plex. 
*/ - } else - gv_plex_normal_request(p, bp); + case BIO_WRITE: + /* XXX can this ever happen? */ + if (wp == NULL) { + completed = bp->bio_completed; + break; + } - mtx_lock(&p->bqueue_mtx); - } - mtx_unlock(&p->bqueue_mtx); - p->flags |= GV_PLEX_THREAD_DEAD; - wakeup(p); + /* Check if we need to handle parity data. */ + TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { + if (bq->bp != bp) + continue; + TAILQ_REMOVE(&wp->bits, bq, queue); + g_free(bq); + cbp = wp->parity; + if (cbp != NULL) { + for (i = 0; i < wp->length; i++) + cbp->bio_data[i] ^= bp->bio_data[i]; + } + break; + } - kthread_exit(ENXIO); -} + /* Handle parity data. */ + if (TAILQ_EMPTY(&wp->bits)) { + if (bp->bio_parent->bio_cflags & GV_BIO_CHECK) + i = gv_check_parity(p, bp, wp); + else + i = gv_normal_parity(p, bp, wp); -static int -gv_normal_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp) -{ - struct bio *cbp, *pbp; - int finished, i; + /* All of our sub-requests have finished. */ + if (i) { + completed = wp->length; + TAILQ_REMOVE(&p->packets, wp, list); + /* Bring the waiting bios back into the game. */ + pbp = bioq_takefirst(p->wqueue); + while (pbp != NULL) { + mtx_lock(&sc->queue_mtx); + bioq_disksort(sc->bqueue, pbp); + mtx_unlock(&sc->queue_mtx); + pbp = bioq_takefirst(p->wqueue); + } + g_free(wp); + } + } - finished = 1; + break; + } - if (wp->waiting != NULL) { - pbp = wp->waiting; - wp->waiting = NULL; - cbp = wp->parity; - for (i = 0; i < wp->length; i++) - cbp->bio_data[i] ^= pbp->bio_data[i]; - g_io_request(pbp, pbp->bio_caller2); - finished = 0; + pbp = bp->bio_parent; + if (pbp->bio_error == 0) + pbp->bio_error = bp->bio_error; + pbp->bio_completed += completed; - } else if (wp->parity != NULL) { - cbp = wp->parity; - wp->parity = NULL; - g_io_request(cbp, cbp->bio_caller2); - finished = 0; + /* When the original request is finished, we deliver it. */ + pbp->bio_inbed++; + if (pbp->bio_inbed == pbp->bio_children) { + /* Hand it over for checking or delivery. 
*/ + if (pbp->bio_cmd == BIO_WRITE && + (pbp->bio_cflags & GV_BIO_CHECK)) { + gv_parity_complete(p, pbp); + } else if (pbp->bio_cmd == BIO_WRITE && + (pbp->bio_cflags & GV_BIO_REBUILD)) { + gv_rebuild_complete(p, pbp); + } else if (pbp->bio_cflags & GV_BIO_INIT) { + gv_init_complete(p, pbp); + } else if (pbp->bio_pflags & GV_BIO_SYNCREQ) { + gv_grow_complete(p, pbp); + } else { + g_io_deliver(pbp, pbp->bio_error); + } } - return (finished); + /* Clean up what we allocated. */ + if (bp->bio_cflags & GV_BIO_MALLOC) + g_free(bp->bio_data); + g_destroy_bio(bp); } static int gv_check_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp) { struct bio *pbp; + struct gv_sd *s; int err, finished, i; err = 0; @@ -374,7 +455,8 @@ if (wp->waiting != NULL) { pbp = wp->waiting; wp->waiting = NULL; - g_io_request(pbp, pbp->bio_caller2); + s = pbp->bio_caller1; + g_io_request(pbp, s->drive_sc->consumer); finished = 0; } else if (wp->parity != NULL) { @@ -395,7 +477,8 @@ /* ... but we rebuild it. 
*/ if (bp->bio_parent->bio_cflags & GV_BIO_PARITY) { - g_io_request(pbp, pbp->bio_caller2); + s = pbp->bio_caller1; + g_io_request(pbp, s->drive_sc->consumer); finished = 0; } } @@ -414,448 +497,504 @@ return (finished); } -void -gv_plex_completed_request(struct gv_plex *p, struct bio *bp) +static int +gv_normal_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp) { struct bio *cbp, *pbp; - struct gv_bioq *bq, *bq2; - struct gv_raid5_packet *wp; - int i; - - wp = bp->bio_driver1; - - switch (bp->bio_parent->bio_cmd) { - case BIO_READ: - if (wp == NULL) - break; - - TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { - if (bq->bp == bp) { - TAILQ_REMOVE(&wp->bits, bq, queue); - g_free(bq); - for (i = 0; i < wp->length; i++) - wp->data[i] ^= bp->bio_data[i]; - break; - } - } - if (TAILQ_EMPTY(&wp->bits)) { - bp->bio_parent->bio_completed += wp->length; - if (wp->lockbase != -1) { - TAILQ_REMOVE(&p->packets, wp, list); - /* Bring the waiting bios back into the game. */ - mtx_lock(&p->bqueue_mtx); - pbp = bioq_takefirst(p->wqueue); - while (pbp != NULL) { - bioq_disksort(p->bqueue, pbp); - pbp = bioq_takefirst(p->wqueue); - } - mtx_unlock(&p->bqueue_mtx); - } - g_free(wp); - } + struct gv_sd *s; + int finished, i; - break; + finished = 1; - case BIO_WRITE: - if (wp == NULL) - break; + if (wp->waiting != NULL) { + pbp = wp->waiting; + wp->waiting = NULL; + cbp = wp->parity; + for (i = 0; i < wp->length; i++) + cbp->bio_data[i] ^= pbp->bio_data[i]; + s = pbp->bio_caller1; + g_io_request(pbp, s->drive_sc->consumer); + finished = 0; - /* Check if we need to handle parity data. 
*/ - TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { - if (bq->bp == bp) { - TAILQ_REMOVE(&wp->bits, bq, queue); - g_free(bq); - cbp = wp->parity; - if (cbp != NULL) { - for (i = 0; i < wp->length; i++) - cbp->bio_data[i] ^= - bp->bio_data[i]; - } - break; - } - } + } else if (wp->parity != NULL) { + cbp = wp->parity; + wp->parity = NULL; + s = cbp->bio_caller1; + g_io_request(cbp, s->drive_sc->consumer); + finished = 0; + } - /* Handle parity data. */ - if (TAILQ_EMPTY(&wp->bits)) { - if (bp->bio_parent->bio_cflags & GV_BIO_CHECK) - i = gv_check_parity(p, bp, wp); - else - i = gv_normal_parity(p, bp, wp); + return (finished); +} - /* All of our sub-requests have finished. */ - if (i) { - bp->bio_parent->bio_completed += wp->length; - TAILQ_REMOVE(&p->packets, wp, list); - /* Bring the waiting bios back into the game. */ - mtx_lock(&p->bqueue_mtx); - pbp = bioq_takefirst(p->wqueue); - while (pbp != NULL) { - bioq_disksort(p->bqueue, pbp); - pbp = bioq_takefirst(p->wqueue); - } - mtx_unlock(&p->bqueue_mtx); - g_free(wp); - } - } +static void +gv_plex_flush(struct gv_plex *p) +{ + struct gv_softc *sc; + struct bio *bp; - break; + sc = p->vinumconf; + bp = bioq_takefirst(p->rqueue); + while (bp != NULL) { + gv_plex_start(p, bp); + bp = bioq_takefirst(p->rqueue); } +} - pbp = bp->bio_parent; - if (pbp->bio_error == 0) - pbp->bio_error = bp->bio_error; +int +gv_sync_request(struct gv_plex *from, struct gv_plex *to, off_t offset, + off_t length, int type, caddr_t data) +{ + struct bio *bp; - /* When the original request is finished, we deliver it. 
*/ - pbp->bio_inbed++; - if (pbp->bio_inbed == pbp->bio_children) - g_io_deliver(pbp, pbp->bio_error); + bp = g_new_bio(); + if (bp == NULL) { + printf("VINUM: sync from '%s' failed at offset " + " %jd; out of memory\n", from->name, offset); + return (ENOMEM); + } + bp->bio_length = length; + bp->bio_done = gv_done; + bp->bio_cflags |= GV_BIO_SYNCREQ; + bp->bio_offset = offset; + bp->bio_caller2 = to; + bp->bio_cmd = type; + if (data == NULL) + data = g_malloc(length, M_WAITOK); + bp->bio_cflags |= GV_BIO_MALLOC; /* Free on the next run. */ + bp->bio_data = data; - /* Clean up what we allocated. */ - if (bp->bio_cflags & GV_BIO_MALLOC) - g_free(bp->bio_data); - g_destroy_bio(bp); + /* Send down next. */ + gv_plex_start(from, bp); + return (0); } -void -gv_plex_normal_request(struct gv_plex *p, struct bio *bp) +/* + * Handle a finished plex sync bio. + */ +int +gv_sync_complete(struct gv_plex *to, struct bio *bp) { - struct bio *cbp, *pbp; - struct gv_bioq *bq, *bq2; - struct gv_raid5_packet *wp, *wp2; - caddr_t addr; - off_t bcount, boff; + struct gv_plex *from, *p; + struct gv_sd *s; + struct gv_volume *v; + struct gv_softc *sc; + off_t offset; int err; - bcount = bp->bio_length; - addr = bp->bio_data; - boff = bp->bio_offset; + g_topology_assert_not(); - /* Walk over the whole length of the request, we might split it up. */ - while (bcount > 0) { - wp = NULL; + err = 0; + from = bp->bio_caller2; + v = to->vol_sc; + sc = v->vinumconf; + + /* If it was a read, write it. */ + if (bp->bio_cmd == BIO_READ) { + err = gv_sync_request(from, to, bp->bio_offset, bp->bio_length, + BIO_WRITE, bp->bio_data); + /* If it was a write, read the next one. */ + } else if (bp->bio_cmd == BIO_WRITE) { + if (bp->bio_cflags & GV_BIO_MALLOC) + g_free(bp->bio_data); + to->synced += bp->bio_length; + /* If we're finished, clean up. 
*/ + if (bp->bio_offset + bp->bio_length >= from->size) { + printf("VINUM: syncing of %s from %s completed\n", + to->name, from->name); + to->flags &= ~GV_PLEX_SYNCING; + to->synced = 0; + /* Update our state. */ + LIST_FOREACH(s, &to->subdisks, in_plex) + gv_set_sd_state(s, GV_SD_UP, 0); + } else { + offset = bp->bio_offset + bp->bio_length; + err = gv_sync_request(from, to, offset, + MIN(bp->bio_length, from->size - offset), + BIO_READ, NULL); + } + } + g_destroy_bio(bp); + /* Clean up if there was an error. */ + if (err) { + to->flags &= ~GV_PLEX_SYNCING; + printf("VINUM: error syncing plexes: error code %d\n", err); + } + + /* Check if all plexes are synced, and lower refcounts. */ + g_topology_lock(); + LIST_FOREACH(p, &v->plexes, in_volume) { + if (p->flags & GV_PLEX_SYNCING) { + g_topology_unlock(); + return (-1); + } + } + /* If we came here, all plexes are synced, and we're free. */ + gv_access(v->provider, -1, -1, 0); + g_topology_unlock(); + printf("VINUM: plex sync completed\n"); + + /* Issue all delayed requests. */ + bp = bioq_takefirst(v->wqueue); + while (bp != NULL) { + gv_volume_start(sc, bp); + bp = bioq_takefirst(v->wqueue); + } + return (0); +} - /* - * RAID5 plexes need special treatment, as a single write - * request involves several read/write sub-requests. - */ - if (p->org == GV_PLEX_RAID5) { - wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); - wp->bio = bp; - TAILQ_INIT(&wp->bits); - - if (bp->bio_cflags & GV_BIO_REBUILD) - err = gv_rebuild_raid5(p, wp, bp, addr, - boff, bcount); - else if (bp->bio_cflags & GV_BIO_CHECK) - err = gv_check_raid5(p, wp, bp, addr, - boff, bcount); - else - err = gv_build_raid5_req(p, wp, bp, addr, - boff, bcount); +/* + * Create a new bio struct for the next grow request. + */ +int +gv_grow_request(struct gv_plex *p, off_t offset, off_t length, int type, + caddr_t data) +{ + struct bio *bp; - /* - * Building the sub-request failed, we probably need to - * clean up a lot. 
- */ - if (err) { - printf("GEOM_VINUM: plex request failed for "); - g_print_bio(bp); - printf("\n"); - TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { - TAILQ_REMOVE(&wp->bits, bq, queue); - g_free(bq); - } - if (wp->waiting != NULL) { - if (wp->waiting->bio_cflags & - GV_BIO_MALLOC) - g_free(wp->waiting->bio_data); - g_destroy_bio(wp->waiting); - } - if (wp->parity != NULL) { - if (wp->parity->bio_cflags & - GV_BIO_MALLOC) - g_free(wp->parity->bio_data); - g_destroy_bio(wp->parity); - } - g_free(wp); + KASSERT(p != NULL, ("gv_grow_request: NULL p")); + bp = g_new_bio(); + if (bp == NULL) { + printf("VINUM: grow of %s failed creating bio: " + "out of memory\n", p->name); + return (ENOMEM); + } - TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { - if (wp->bio == bp) { - TAILQ_REMOVE(&p->packets, wp, - list); - TAILQ_FOREACH_SAFE(bq, - &wp->bits, queue, bq2) { - TAILQ_REMOVE(&wp->bits, - bq, queue); - g_free(bq); - } - g_free(wp); - } - } + bp->bio_cmd = type; + bp->bio_done = gv_done; + bp->bio_error = 0; + bp->bio_offset = offset; + bp->bio_length = length; + bp->bio_pflags |= GV_BIO_SYNCREQ; /* XXX: misuse of pflags AND syncreq.*/ + if (data == NULL) + data = g_malloc(length, M_WAITOK); + bp->bio_cflags |= GV_BIO_MALLOC; + bp->bio_data = data; - cbp = bp->bio_driver1; - while (cbp != NULL) { - pbp = cbp->bio_caller1; - if (cbp->bio_cflags & GV_BIO_MALLOC) - g_free(cbp->bio_data); - g_destroy_bio(cbp); - cbp = pbp; - } + /* Send down. */ + gv_plex_start(p, bp); + return (0); +} - g_io_deliver(bp, err); - return; - } - - if (TAILQ_EMPTY(&wp->bits)) - g_free(wp); - else if (wp->lockbase != -1) - TAILQ_INSERT_TAIL(&p->packets, wp, list); +/* + * Finish handling of a bio to a growing plex. + */ +void +gv_grow_complete(struct gv_plex *p, struct bio *bp) +{ + struct gv_sd *s; + struct gv_volume *v; + off_t origsize, offset; + int sdcount, err; - /* - * Requests to concatenated and striped plexes go straight - * through. 
- */ - } else { - err = gv_plexbuffer(p, bp, addr, boff, bcount); + v = p->vol_sc; + err = 0; - /* Building the sub-request failed. */ - if (err) { - printf("GEOM_VINUM: plex request failed for "); - g_print_bio(bp); - printf("\n"); - cbp = bp->bio_driver1; - while (cbp != NULL) { - pbp = cbp->bio_caller1; - g_destroy_bio(cbp); - cbp = pbp; - } - g_io_deliver(bp, err); - return; - } + /* If it was a read, write it. */ + if (bp->bio_cmd == BIO_READ) { + p->synced += bp->bio_length; + err = gv_grow_request(p, bp->bio_offset, bp->bio_length, + BIO_WRITE, bp->bio_data); + /* If it was a write, read next. */ + } else if (bp->bio_cmd == BIO_WRITE) { + if (bp->bio_cflags & GV_BIO_MALLOC) + g_free(bp->bio_data); + + /* Find the real size of the plex. */ + sdcount = gv_sdcount(p, 1); + s = LIST_FIRST(&p->subdisks); + /* XXX: should not ever happen */ + if (s == NULL) { + printf("VINUM: error growing plex without subdisks"); + return; + } + origsize = (s->size * (sdcount - 1)); + if (bp->bio_offset + bp->bio_length >= origsize) { + printf("VINUM: growing of %s completed\n", p->name); + p->flags &= ~GV_PLEX_GROWING; + LIST_FOREACH(s, &p->subdisks, in_plex) { + s->flags &= ~GV_SD_GROW; + gv_set_sd_state(s, GV_SD_UP, 0); + } + p->size = gv_plex_size(p); + gv_update_vol_size(v, gv_vol_size(v)); + gv_set_plex_state(p, GV_PLEX_UP, 0); + g_topology_lock(); + gv_access(v->provider, -1, -1, 0); + g_topology_unlock(); + p->synced = 0; + /* Issue delayed requests. */ + gv_plex_flush(p); + } else { + offset = bp->bio_offset + bp->bio_length; + err = gv_grow_request(p, offset, + MIN(bp->bio_length, origsize - offset), + BIO_READ, NULL); } - - /* Abuse bio_caller1 as linked list. */ - pbp = bp->bio_driver1; - while (pbp->bio_caller1 != NULL) - pbp = pbp->bio_caller1; - bcount -= pbp->bio_length; - addr += pbp->bio_length; - boff += pbp->bio_length; } + g_destroy_bio(bp); - /* Fire off all sub-requests. 
*/ - pbp = bp->bio_driver1; - while (pbp != NULL) { - /* - * RAID5 sub-requests need to come in correct order, otherwise - * we trip over the parity, as it might be overwritten by - * another sub-request. - */ - if (pbp->bio_driver1 != NULL && - gv_stripe_active(p, pbp)) { - /* Park the bio on the waiting queue. */ - pbp->bio_cflags |= GV_BIO_ONHOLD; - mtx_lock(&p->bqueue_mtx); - bioq_disksort(p->wqueue, pbp); - mtx_unlock(&p->bqueue_mtx); - } else - g_io_request(pbp, pbp->bio_caller2); - pbp = pbp->bio_caller1; + if (err) { + p->flags &= ~GV_PLEX_GROWING; + printf("VINUM: error growing plex: error code %d\n", err); } } -static int -gv_plex_access(struct g_provider *pp, int dr, int dw, int de) + +/* + * Create an initialization BIO and send it off to the consumer. Assume that + * we're given initialization data as a parameter. + */ +void +gv_init_request(struct gv_sd *s, off_t start, caddr_t data, off_t length) +{ + struct gv_drive *d; + struct g_consumer *cp; + struct bio *bp, *cbp; + + KASSERT(s != NULL, ("gv_init_request: NULL s")); + d = s->drive_sc; + KASSERT(d != NULL, ("gv_init_request: NULL d")); + cp = d->consumer; + KASSERT(cp != NULL, ("gv_init_request: NULL cp")); + + bp = g_new_bio(); + if (bp == NULL) { + printf("VINUM: subdisk '%s' init: write failed at offset %jd" + " (drive offset %jd); out of memory\n", s->name, + (intmax_t)s->initialized, (intmax_t)start); + return; /* XXX: Error codes. */ + } + bp->bio_cmd = BIO_WRITE; + bp->bio_data = data; + bp->bio_done = gv_done; + bp->bio_error = 0; + bp->bio_length = length; + bp->bio_cflags |= GV_BIO_INIT; + bp->bio_offset = start; + bp->bio_caller1 = s; + + /* Then of course, we have to clone it. */ + cbp = g_clone_bio(bp); + if (cbp == NULL) { + printf("VINUM: subdisk '%s' init: write failed at offset %jd" + " (drive offset %jd); out of memory\n", s->name, + (intmax_t)s->initialized, (intmax_t)start); + return; /* XXX: Error codes. 
*/ + } + cbp->bio_done = gv_done; + cbp->bio_caller1 = s; + /* Send it off to the consumer. */ + g_io_request(cbp, cp); +} + +/* + * Handle a finished initialization BIO. + */ +void +gv_init_complete(struct gv_plex *p, struct bio *bp) { - struct g_geom *gp; - struct g_consumer *cp, *cp2; + struct gv_drive *d; + struct g_consumer *cp; + struct gv_sd *s; + off_t start, length; + caddr_t data; int error; - gp = pp->geom; + s = bp->bio_caller1; + start = bp->bio_offset; + length = bp->bio_length; + error = bp->bio_error; + data = bp->bio_data; + + KASSERT(s != NULL, ("gv_init_complete: NULL s")); + d = s->drive_sc; + KASSERT(d != NULL, ("gv_init_complete: NULL d")); + cp = d->consumer; + KASSERT(cp != NULL, ("gv_init_complete: NULL cp")); - LIST_FOREACH(cp, &gp->consumer, consumer) { - error = g_access(cp, dr, dw, de); + g_destroy_bio(bp); + + /* + * First we need to find out if it was okay, and abort if it's not. + * Then we need to free previous buffers, find out the correct subdisk, + * as well as getting the correct starting point and length of the BIO. + */ + if (start >= s->drive_offset + s->size) { + /* Free the data we initialized. */ + if (data != NULL) + g_free(data); + g_topology_assert_not(); + g_topology_lock(); + g_access(cp, 0, -1, 0); + g_topology_unlock(); if (error) { - LIST_FOREACH(cp2, &gp->consumer, consumer) { - if (cp == cp2) - break; - g_access(cp2, -dr, -dw, -de); - } - return (error); + gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE | + GV_SETSTATE_CONFIG); + } else { + gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_CONFIG); + s->initialized = 0; + printf("VINUM: subdisk '%s' init: finished " + "successfully\n", s->name); } + return; } - return (0); + s->initialized += length; + start += length; + gv_init_request(s, start, data, length); } -static struct g_geom * -gv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) +/* + * Create a new bio struct for the next parity rebuild. 
Used both by internal + * rebuild of degraded plexes as well as user initiated rebuilds/checks. + */ +void +gv_parity_request(struct gv_plex *p, int flags, off_t offset) { - struct g_geom *gp; - struct g_consumer *cp, *cp2; - struct g_provider *pp2; - struct gv_plex *p; - struct gv_sd *s; - struct gv_softc *sc; - int error; + struct bio *bp; - g_trace(G_T_TOPOLOGY, "gv_plex_taste(%s, %s)", mp->name, pp->name); - g_topology_assert(); + KASSERT(p != NULL, ("gv_parity_request: NULL p")); - /* We only want to attach to subdisks. */ - if (strcmp(pp->geom->class->name, "VINUMDRIVE")) - return (NULL); - - /* Find the VINUM class and its associated geom. */ - gp = find_vinum_geom(); - if (gp == NULL) - return (NULL); - sc = gp->softc; - KASSERT(sc != NULL, ("gv_plex_taste: NULL sc")); - - /* Find out which subdisk the offered provider corresponds to. */ - s = pp->private; - KASSERT(s != NULL, ("gv_plex_taste: NULL s")); - - /* Now find the correct plex where this subdisk belongs to. */ - p = gv_find_plex(sc, s->plex); - if (p == NULL) { - printf("gv_plex_taste: NULL p for '%s'\n", s->name); - return (NULL); + bp = g_new_bio(); + if (bp == NULL) { + printf("VINUM: rebuild of %s failed creating bio: " + "out of memory\n", p->name); + return; } + bp->bio_cmd = BIO_WRITE; + bp->bio_done = gv_done; + bp->bio_error = 0; + bp->bio_length = p->stripesize; + /* - * Add this subdisk to this plex. Since we trust the on-disk - * configuration, we don't check the given value (should we?). - * XXX: shouldn't be done here + * Check if it's a rebuild of a degraded plex or a user request of + * parity rebuild. */ - gv_sd_to_plex(p, s, 0); + if (flags & GV_BIO_REBUILD) + bp->bio_data = g_malloc(GV_DFLT_SYNCSIZE, M_WAITOK); + else if (flags & GV_BIO_CHECK) + bp->bio_data = g_malloc(p->stripesize, M_WAITOK | M_ZERO); + else { + printf("VINUM: invalid flags given\n"); + return; + } - /* Now check if there's already a geom for this plex. 
*/ - gp = p->geom; + bp->bio_cflags = flags; + bp->bio_cflags |= GV_BIO_MALLOC; - /* Yes, there is already a geom, so we just add the consumer. */ - if (gp != NULL) { - cp2 = LIST_FIRST(&gp->consumer); - /* Need to attach a new consumer to this subdisk. */ - cp = g_new_consumer(gp); - error = g_attach(cp, pp); - if (error) { - printf("geom_vinum: couldn't attach consumer to %s\n", - pp->name); - g_destroy_consumer(cp); - return (NULL); - } - /* Adjust the access counts of the new consumer. */ - if ((cp2 != NULL) && (cp2->acr || cp2->acw || cp2->ace)) { - error = g_access(cp, cp2->acr, cp2->acw, cp2->ace); - if (error) { - printf("geom_vinum: couldn't set access counts" - " for consumer on %s\n", pp->name); - g_detach(cp); - g_destroy_consumer(cp); - return (NULL); - } - } - s->consumer = cp; + /* We still have more parity to build. */ + bp->bio_offset = offset; - /* Adjust the size of the providers this plex has. */ - LIST_FOREACH(pp2, &gp->provider, provider) - pp2->mediasize = p->size; - - /* Update the size of the volume this plex is attached to. */ - if (p->vol_sc != NULL) - gv_update_vol_size(p->vol_sc, p->size); + gv_plex_start(p, bp); /* Send it down to the plex. */ +} - /* - * If necessary, create bio queues, queue mutex and a worker - * thread. - */ - if (p->bqueue == NULL) { - p->bqueue = g_malloc(sizeof(struct bio_queue_head), - M_WAITOK | M_ZERO); - bioq_init(p->bqueue); - } - if (p->wqueue == NULL) { - p->wqueue = g_malloc(sizeof(struct bio_queue_head), - M_WAITOK | M_ZERO); - bioq_init(p->wqueue); - } - if (mtx_initialized(&p->bqueue_mtx) == 0) - mtx_init(&p->bqueue_mtx, "gv_plex", NULL, MTX_DEF); - if (!(p->flags & GV_PLEX_THREAD_ACTIVE)) { - kthread_create(gv_plex_worker, p, NULL, 0, 0, "gv_p %s", - p->name); - p->flags |= GV_PLEX_THREAD_ACTIVE; - } +/* + * Handle a finished parity write. 
+ */ +void +gv_parity_complete(struct gv_plex *p, struct bio *bp) +{ + int error, flags; + + error = bp->bio_error; + flags = bp->bio_cflags; + flags &= ~GV_BIO_MALLOC; - return (NULL); + /* Clean up what we allocated. */ + if (bp->bio_cflags & GV_BIO_MALLOC) + g_free(bp->bio_data); + g_destroy_bio(bp); - /* We need to create a new geom. */ + if (error) { + /* Make sure we don't have the lock. */ + g_topology_assert_not(); + g_topology_lock(); + gv_access(p->vol_sc->provider, -1, -1, 0); + g_topology_unlock(); + + if (error == EAGAIN) { + printf("VINUM: Parity incorrect at offset 0x%jx\n", + (intmax_t)p->synced); + if (!(flags & GV_BIO_PARITY)) + return; + } + printf("VINUM: Parity check on %s failed at 0x%jx errno %d\n", + p->name, (intmax_t)p->synced, error); } else { - gp = g_new_geomf(mp, "%s", p->name); - gp->start = gv_plex_start; - gp->orphan = gv_plex_orphan; - gp->access = gv_plex_access; - gp->softc = p; - p->geom = gp; - - TAILQ_INIT(&p->packets); - p->bqueue = g_malloc(sizeof(struct bio_queue_head), - M_WAITOK | M_ZERO); - bioq_init(p->bqueue); - p->wqueue = g_malloc(sizeof(struct bio_queue_head), - M_WAITOK | M_ZERO); - bioq_init(p->wqueue); - mtx_init(&p->bqueue_mtx, "gv_plex", NULL, MTX_DEF); - kthread_create(gv_plex_worker, p, NULL, 0, 0, "gv_p %s", - p->name); - p->flags |= GV_PLEX_THREAD_ACTIVE; - - /* Attach a consumer to this provider. */ - cp = g_new_consumer(gp); - g_attach(cp, pp); - s->consumer = cp; - - /* Create a provider for the outside world. */ - pp2 = g_new_providerf(gp, "gvinum/plex/%s", p->name); - pp2->mediasize = p->size; - pp2->sectorsize = pp->sectorsize; - p->provider = pp2; - g_error_provider(pp2, 0); - return (gp); + p->synced += p->stripesize; } -} -static int -gv_plex_destroy_geom(struct gctl_req *req, struct g_class *mp, - struct g_geom *gp) -{ - struct gv_plex *p; + if (p->synced >= p->size) { + /* Make sure we don't have the lock. 
*/ + g_topology_assert_not(); + g_topology_lock(); + gv_access(p->vol_sc->provider, -1, -1, 0); + g_topology_unlock(); + + /* We're finished. */ + printf("VINUM: Parity operation on %s finished\n", p->name); + p->synced = 0; + return; + } - g_trace(G_T_TOPOLOGY, "gv_plex_destroy_geom: %s", gp->name); - g_topology_assert(); + /* Send down next. It will determine if we need to itself. */ + gv_parity_request(p, flags, p->synced); +} - p = gp->softc; +/* + * Handle a finished plex rebuild bio. + */ +void +gv_rebuild_complete(struct gv_plex *p, struct bio *bp) +{ + struct gv_sd *s; + int error, flags; + off_t offset; - KASSERT(p != NULL, ("gv_plex_destroy_geom: null p of '%s'", gp->name)); + error = bp->bio_error; + flags = bp->bio_cflags; + offset = bp->bio_offset; + flags &= ~GV_BIO_MALLOC; - /* - * If this is a RAID5 plex, check if its worker thread is still active - * and signal it to self destruct. - */ - gv_kill_plex_thread(p); - /* g_free(sc); */ - g_wither_geom(gp, ENXIO); - return (0); -} + /* Clean up what we allocated. */ + if (bp->bio_cflags & GV_BIO_MALLOC) + g_free(bp->bio_data); + g_destroy_bio(bp); -#define VINUMPLEX_CLASS_NAME "VINUMPLEX" + if (error) { + g_topology_assert_not(); + g_topology_lock(); + gv_access(p->vol_sc->provider, -1, -1, 0); + g_topology_unlock(); + + printf("VINUM: rebuild of %s failed at offset %jd errno: %d\n", + p->name, (intmax_t)offset, error); + p->flags &= ~GV_PLEX_REBUILDING; + p->synced = 0; + gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */ + return; + } -static struct g_class g_vinum_plex_class = { - .name = VINUMPLEX_CLASS_NAME, - .version = G_VERSION, - .taste = gv_plex_taste, - .destroy_geom = gv_plex_destroy_geom, -}; + offset += (p->stripesize * (gv_sdcount(p, 1) - 1)); + if (offset >= p->size) { + /* We're finished. 
*/ + g_topology_assert_not(); + g_topology_lock(); + gv_access(p->vol_sc->provider, -1, -1, 0); + g_topology_unlock(); + + printf("VINUM: rebuild of %s finished\n", p->name); + gv_save_config(p->vinumconf); + p->flags &= ~GV_PLEX_REBUILDING; + p->synced = 0; + /* Try to up all subdisks. */ + LIST_FOREACH(s, &p->subdisks, in_plex) + gv_update_sd_state(s); + gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */ + return; + } -DECLARE_GEOM_CLASS(g_vinum_plex_class, g_vinum_plex); + /* Send down next. It will determine if we need to itself. */ + gv_parity_request(p, flags, offset); +} Index: sys/geom/vinum/geom_vinum_raid5.c =================================================================== RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum_raid5.c,v retrieving revision 1.10 diff -u -u -r1.10 geom_vinum_raid5.c --- sys/geom/vinum/geom_vinum_raid5.c 26 Nov 2004 11:59:51 -0000 1.10 +++ sys/geom/vinum/geom_vinum_raid5.c 3 Nov 2007 02:40:17 -0000 @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -29,14 +29,8 @@ #include #include -#include -#include -#include -#include -#include #include #include -#include #include #include @@ -44,8 +38,93 @@ #include #include -int gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *, - int *, int *); +static int gv_raid5_offset(struct gv_plex *, off_t, off_t, + off_t *, off_t *, int *, int *, int); +static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *, + struct gv_raid5_packet *, caddr_t, int); +static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *, + struct bio *, caddr_t, off_t, off_t, int *); +static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *, + struct bio *, caddr_t, off_t, off_t); +static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *, + struct bio *, caddr_t, off_t, off_t); + +struct gv_raid5_packet * +gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, + off_t bcount) +{ + struct bio *cbp; + struct gv_raid5_packet *wp, *wp2; + struct gv_bioq *bq, *bq2; + int err, delay; + + delay = 0; + wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); + wp->bio = bp; + TAILQ_INIT(&wp->bits); + + if (bp->bio_cflags & GV_BIO_REBUILD) + err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount); + else if (bp->bio_cflags & GV_BIO_CHECK) + err = gv_raid5_check(p, wp, bp, addr, boff, bcount); + else + err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay); + + /* Means we have a delayed request. */ + if (delay) { + g_free(wp); + return (NULL); + } + + /* + * Building the sub-request failed, we probably need to clean up a lot. 
+ */ + if (err) { + printf("VINUM: plex request failed for "); + g_print_bio(bp); + printf("\n"); + TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { + TAILQ_REMOVE(&wp->bits, bq, queue); + g_free(bq); + } + if (wp->waiting != NULL) { + if (wp->waiting->bio_cflags & GV_BIO_MALLOC) + g_free(wp->waiting->bio_data); + g_destroy_bio(wp->waiting); + } + if (wp->parity != NULL) { + if (wp->parity->bio_cflags & GV_BIO_MALLOC) + g_free(wp->parity->bio_data); + g_destroy_bio(wp->parity); + } + g_free(wp); + + TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { + if (wp->bio != bp) + continue; + + TAILQ_REMOVE(&p->packets, wp, list); + TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { + TAILQ_REMOVE(&wp->bits, bq, queue); + g_free(bq); + } + g_free(wp); + } + + cbp = bioq_takefirst(p->bqueue); + while (cbp != NULL) { + if (cbp->bio_cflags & GV_BIO_MALLOC) + g_free(cbp->bio_data); + g_destroy_bio(cbp); + cbp = bioq_takefirst(p->bqueue); + } + + g_io_deliver(bp, err); + return (NULL); + } + + return (wp); +} /* * Check if the stripe that the work packet wants is already being used by @@ -57,7 +136,7 @@ struct gv_raid5_packet *wp, *owp; int overlap; - wp = bp->bio_driver1; + wp = bp->bio_caller2; if (wp->lockbase == -1) return (0); @@ -80,20 +159,20 @@ return (overlap); } -int -gv_check_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, +static int +gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct gv_sd *parity, *s; struct gv_bioq *bq; - struct bio *cbp, *pbp; + struct bio *cbp; int i, psdno; off_t real_len, real_off; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); - gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno); + gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1); /* Find the right subdisk. */ parity = NULL; @@ -122,20 +201,16 @@ /* Skip the parity subdisk. */ if (s == parity) continue; + /* Skip growing subdisks. 
*/ + if (s->flags & GV_SD_GROW) + continue; - cbp = g_clone_bio(bp); + cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; - cbp->bio_data = g_malloc(real_len, M_WAITOK); - cbp->bio_cflags |= GV_BIO_MALLOC; - cbp->bio_offset = real_off; - cbp->bio_length = real_len; - cbp->bio_done = gv_plex_done; - cbp->bio_caller2 = s->consumer; - cbp->bio_driver1 = wp; - GV_ENQUEUE(bp, cbp, pbp); + bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; @@ -143,51 +218,38 @@ } /* Read the parity data. */ - cbp = g_clone_bio(bp); + cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; - cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO); - cbp->bio_cflags |= GV_BIO_MALLOC; - cbp->bio_offset = real_off; - cbp->bio_length = real_len; - cbp->bio_done = gv_plex_done; - cbp->bio_caller2 = parity->consumer; - cbp->bio_driver1 = wp; wp->waiting = cbp; /* * In case we want to rebuild the parity, create an extra BIO to write * it out. It also acts as buffer for the XOR operations. */ - cbp = g_clone_bio(bp); + cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1); if (cbp == NULL) return (ENOMEM); - cbp->bio_data = addr; - cbp->bio_offset = real_off; - cbp->bio_length = real_len; - cbp->bio_done = gv_plex_done; - cbp->bio_caller2 = parity->consumer; - cbp->bio_driver1 = wp; wp->parity = cbp; return (0); } /* Rebuild a degraded RAID5 plex. 
*/ -int -gv_rebuild_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, +static int +gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct gv_sd *broken, *s; struct gv_bioq *bq; - struct bio *cbp, *pbp; + struct bio *cbp; off_t real_len, real_off; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); - gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL); + gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1); /* Find the right subdisk. */ broken = NULL; @@ -210,6 +272,8 @@ printf("GEOM_VINUM: sd %s is reviving\n", broken->name); gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE); + /* Set this bit now, but should be set at end. */ + broken->flags |= GV_SD_CANGOUP; break; case GV_SD_REVIVING: @@ -232,19 +296,16 @@ if (s == broken) continue; - cbp = g_clone_bio(bp); + /* Skip growing subdisks. */ + if (s->flags & GV_SD_GROW) + continue; + + cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; - cbp->bio_data = g_malloc(real_len, M_WAITOK); - cbp->bio_cflags |= GV_BIO_MALLOC; - cbp->bio_offset = real_off; - cbp->bio_length = real_len; - cbp->bio_done = gv_plex_done; - cbp->bio_caller2 = s->consumer; - cbp->bio_driver1 = wp; - GV_ENQUEUE(bp, cbp, pbp); + bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; @@ -252,34 +313,28 @@ } /* Write the parity data. */ - cbp = g_clone_bio(bp); + cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); - cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO); - cbp->bio_cflags |= GV_BIO_MALLOC; - cbp->bio_offset = real_off; - cbp->bio_length = real_len; - cbp->bio_done = gv_plex_done; - cbp->bio_caller2 = broken->consumer; - cbp->bio_driver1 = wp; cbp->bio_cflags |= GV_BIO_REBUILD; wp->parity = cbp; p->synced = boff; + /* Post notification that we're finished. 
*/ return (0); } /* Build a request group to perform (part of) a RAID5 request. */ -int -gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp, - struct bio *bp, caddr_t addr, off_t boff, off_t bcount) +static int +gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp, + struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay) { struct g_geom *gp; struct gv_sd *broken, *original, *parity, *s; struct gv_bioq *bq; - struct bio *cbp, *pbp; - int i, psdno, sdno, type; + struct bio *cbp; + int i, psdno, sdno, type, grow; off_t real_len, real_off; gp = bp->bio_to->geom; @@ -295,7 +350,24 @@ type = REQ_TYPE_NORMAL; original = parity = broken = NULL; - gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno); + /* XXX: The resize won't crash with rebuild or sync, but we should still + * be aware of it. Also this should perhaps be done on rebuild/check as + * well? + */ + /* If we're over, we must use the old. */ + if (boff >= p->synced) { + grow = 1; + /* Or if over the resized offset, we use all drives. */ + } else if (boff + bcount <= p->synced) { + grow = 0; + /* Else, we're in the middle, and must wait a bit. */ + } else { + bioq_disksort(p->rqueue, bp); + *delay = 1; + return (0); + } + gv_raid5_offset(p, boff, bcount, &real_off, &real_len, + &sdno, &psdno, grow); /* Find the right subdisks. */ i = 0; @@ -330,9 +402,15 @@ KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); - if ((p->flags & GV_PLEX_SYNCING) && (boff + real_len < p->synced)) + if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced)) type = REQ_TYPE_NORMAL; + if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) { + bioq_disksort(p->rqueue, bp); + *delay = 1; + return (0); + } + switch (bp->bio_cmd) { case BIO_READ: /* @@ -346,18 +424,14 @@ /* Skip the broken subdisk. */ if (s == broken) continue; - cbp = g_clone_bio(bp); + /* Skip growing if within offset. 
*/ + if (grow && s->flags & GV_SD_GROW) + continue; + cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); - cbp->bio_data = g_malloc(real_len, M_WAITOK); - cbp->bio_cflags |= GV_BIO_MALLOC; - cbp->bio_offset = real_off; - cbp->bio_length = real_len; - cbp->bio_done = gv_plex_done; - cbp->bio_caller2 = s->consumer; - cbp->bio_driver1 = wp; - GV_ENQUEUE(bp, cbp, pbp); + bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; @@ -366,16 +440,11 @@ /* A normal read can be fulfilled with the original subdisk. */ } else { - cbp = g_clone_bio(bp); + cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0); if (cbp == NULL) return (ENOMEM); - cbp->bio_offset = real_off; - cbp->bio_length = real_len; - cbp->bio_data = addr; - cbp->bio_done = g_std_done; - cbp->bio_caller2 = original->consumer; - GV_ENQUEUE(bp, cbp, pbp); + bioq_insert_tail(p->bqueue, cbp); } wp->lockbase = -1; @@ -394,20 +463,16 @@ /* Skip the broken and the parity subdisk. */ if ((s == broken) || (s == parity)) continue; + /* Skip growing if within offset. */ + if (grow && s->flags & GV_SD_GROW) + continue; - cbp = g_clone_bio(bp); + cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; - cbp->bio_data = g_malloc(real_len, M_WAITOK); - cbp->bio_cflags |= GV_BIO_MALLOC; - cbp->bio_offset = real_off; - cbp->bio_length = real_len; - cbp->bio_done = gv_plex_done; - cbp->bio_caller2 = s->consumer; - cbp->bio_driver1 = wp; - GV_ENQUEUE(bp, cbp, pbp); + bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; @@ -415,34 +480,21 @@ } /* Write the parity data. 
*/ - cbp = g_clone_bio(bp); + cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); - cbp->bio_data = g_malloc(real_len, M_WAITOK); - cbp->bio_cflags |= GV_BIO_MALLOC; - bcopy(addr, cbp->bio_data, real_len); - cbp->bio_offset = real_off; - cbp->bio_length = real_len; - cbp->bio_done = gv_plex_done; - cbp->bio_caller2 = parity->consumer; - cbp->bio_driver1 = wp; + bcopy(addr, cbp->bio_data, wp->length); wp->parity = cbp; /* * When the parity stripe is missing we just write out the data. */ } else if (type == REQ_TYPE_NOPARITY) { - cbp = g_clone_bio(bp); + cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); if (cbp == NULL) return (ENOMEM); - cbp->bio_offset = real_off; - cbp->bio_length = real_len; - cbp->bio_data = addr; - cbp->bio_done = gv_plex_done; - cbp->bio_caller2 = original->consumer; - cbp->bio_driver1 = wp; - GV_ENQUEUE(bp, cbp, pbp); + bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; @@ -455,54 +507,33 @@ */ } else { /* Read old parity. */ - cbp = g_clone_bio(bp); + cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; - cbp->bio_data = g_malloc(real_len, M_WAITOK); - cbp->bio_cflags |= GV_BIO_MALLOC; - cbp->bio_offset = real_off; - cbp->bio_length = real_len; - cbp->bio_done = gv_plex_done; - cbp->bio_caller2 = parity->consumer; - cbp->bio_driver1 = wp; - GV_ENQUEUE(bp, cbp, pbp); + bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* Read old data. 
*/ - cbp = g_clone_bio(bp); + cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; - cbp->bio_data = g_malloc(real_len, M_WAITOK); - cbp->bio_cflags |= GV_BIO_MALLOC; - cbp->bio_offset = real_off; - cbp->bio_length = real_len; - cbp->bio_done = gv_plex_done; - cbp->bio_caller2 = original->consumer; - cbp->bio_driver1 = wp; - GV_ENQUEUE(bp, cbp, pbp); + bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* Write new data. */ - cbp = g_clone_bio(bp); + cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); if (cbp == NULL) return (ENOMEM); - cbp->bio_data = addr; - cbp->bio_offset = real_off; - cbp->bio_length = real_len; - cbp->bio_done = gv_plex_done; - cbp->bio_caller2 = original->consumer; - - cbp->bio_driver1 = wp; /* * We must not write the new data until the old data @@ -512,16 +543,9 @@ wp->waiting = cbp; /* The final bio for the parity. */ - cbp = g_clone_bio(bp); + cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); - cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO); - cbp->bio_cflags |= GV_BIO_MALLOC; - cbp->bio_offset = real_off; - cbp->bio_length = real_len; - cbp->bio_done = gv_plex_done; - cbp->bio_caller2 = parity->consumer; - cbp->bio_driver1 = wp; /* Remember that this is the BIO for the parity data. */ wp->parity = cbp; @@ -535,21 +559,36 @@ return (0); } -/* Calculate the offsets in the various subdisks for a RAID5 request. */ -int +/* + * Calculate the offsets in the various subdisks for a RAID5 request. Also take + * care of new subdisks in an expanded RAID5 array. + * XXX: This assumes that the new subdisks are inserted after the others (which + * is okay as long as plex_offset is larger). If subdisks are inserted into the + * plexlist before, we get problems. 
+ */ +static int gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, - off_t *real_len, int *sdno, int *psdno) + off_t *real_len, int *sdno, int *psdno, int growing) { - int sd, psd; + struct gv_sd *s; + int sd, psd, sdcount; off_t len_left, stripeend, stripeoff, stripestart; + sdcount = p->sdcount; + if (growing) { + LIST_FOREACH(s, &p->subdisks, in_plex) { + if (s->flags & GV_SD_GROW) + sdcount--; + } + } + /* The number of the subdisk containing the parity stripe. */ - psd = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) % - p->sdcount; + psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) % + sdcount; KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0")); /* Offset of the start address from the start of the stripe. */ - stripeoff = boff % (p->stripesize * (p->sdcount - 1)); + stripeoff = boff % (p->stripesize * (sdcount - 1)); KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); /* The number of the subdisk where the stripe resides. */ @@ -561,7 +600,7 @@ sd++; /* The offset of the stripe on this subdisk. 
*/ - stripestart = (boff - stripeoff) / (p->sdcount - 1); + stripestart = (boff - stripeoff) / (sdcount - 1); KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); stripeoff %= p->stripesize; @@ -582,3 +621,27 @@ return (0); } + +static struct bio * +gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp, + caddr_t addr, int use_wp) +{ + struct bio *cbp; + + cbp = g_clone_bio(bp); + if (cbp == NULL) + return (NULL); + if (addr == NULL) { + cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO); + cbp->bio_cflags |= GV_BIO_MALLOC; + } else + cbp->bio_data = addr; + cbp->bio_offset = wp->lockbase + s->drive_offset; + cbp->bio_length = wp->length; + cbp->bio_done = gv_done; + cbp->bio_caller1 = s; + if (use_wp) + cbp->bio_caller2 = wp; + + return (cbp); +} Index: sys/geom/vinum/geom_vinum_raid5.h =================================================================== RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum_raid5.h,v retrieving revision 1.7 diff -u -u -r1.7 geom_vinum_raid5.h --- sys/geom/vinum/geom_vinum_raid5.h 17 Aug 2006 22:50:33 -0000 1.7 +++ sys/geom/vinum/geom_vinum_raid5.h 3 Nov 2007 02:40:17 -0000 @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -35,26 +35,10 @@ * transaction (read or write). */ -#define GV_ENQUEUE(bp, cbp, pbp) \ - do { \ - if (bp->bio_driver1 == NULL) { \ - bp->bio_driver1 = cbp; \ - } else { \ - pbp = bp->bio_driver1; \ - while (pbp->bio_caller1 != NULL) \ - pbp = pbp->bio_caller1; \ - pbp->bio_caller1 = cbp; \ - } \ - } while (0) - struct gv_raid5_packet { caddr_t data; /* Data buffer of this sub-request- */ off_t length; /* Size of data buffer. */ off_t lockbase; /* Deny access to our plex offset. */ - off_t offset; /* The drive offset of the subdisk. */ - int bufmalloc; /* Flag if data buffer was malloced. */ - int active; /* Count of active subrequests. 
*/ - int rqcount; /* Count of subrequests. */ struct bio *bio; /* Pointer to the original bio. */ struct bio *parity; /* The bio containing the parity data. */ @@ -64,14 +48,7 @@ TAILQ_ENTRY(gv_raid5_packet) list; /* Entry in plex's packet list. */ }; -int gv_stripe_active(struct gv_plex *, struct bio *); -int gv_build_raid5_req(struct gv_plex *, struct gv_raid5_packet *, - struct bio *, caddr_t, off_t, off_t); -int gv_check_raid5(struct gv_plex *, struct gv_raid5_packet *, - struct bio *, caddr_t, off_t, off_t); -int gv_rebuild_raid5(struct gv_plex *, struct gv_raid5_packet *, - struct bio *, caddr_t, off_t, off_t); -void gv_raid5_worker(void *); -void gv_plex_done(struct bio *); +struct gv_raid5_packet * gv_raid5_start(struct gv_plex *, struct bio *, + caddr_t, off_t, off_t); #endif /* !_GEOM_VINUM_RAID5_H_ */ Index: sys/geom/vinum/geom_vinum_rename.c =================================================================== RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum_rename.c,v retrieving revision 1.3 diff -u -u -r1.3 geom_vinum_rename.c --- sys/geom/vinum/geom_vinum_rename.c 20 Nov 2005 12:14:18 -0000 1.3 +++ sys/geom/vinum/geom_vinum_rename.c 3 Nov 2007 02:40:17 -0000 @@ -34,22 +34,11 @@ #include #include -#include #include #include #include #include -#include - -static int gv_rename_drive(struct gv_softc *, struct gctl_req *, - struct gv_drive *, char *, int); -static int gv_rename_plex(struct gv_softc *, struct gctl_req *, - struct gv_plex *, char *, int); -static int gv_rename_sd(struct gv_softc *, struct gctl_req *, - struct gv_sd *, char *, int); -static int gv_rename_vol(struct gv_softc *, struct gctl_req *, - struct gv_volume *, char *, int); void gv_rename(struct g_geom *gp, struct gctl_req *req) @@ -59,8 +48,8 @@ struct gv_plex *p; struct gv_sd *s; struct gv_drive *d; - char *newname, *object; - int err, *flags, type; + char *newname, *object, *name; + int *flags, type; sc = gp->softc; @@ -86,9 +75,9 @@ gctl_error(req, "unknown volume '%s'", object); 
return; } - err = gv_rename_vol(sc, req, v, newname, *flags); - if (err) - return; + name = g_malloc(GV_MAXVOLNAME, M_WAITOK | M_ZERO); + strlcpy(name, newname, GV_MAXVOLNAME); + gv_post_event(sc, GV_EVENT_RENAME_VOL, v, name, *flags, 0); break; case GV_TYPE_PLEX: p = gv_find_plex(sc, object); @@ -96,9 +85,9 @@ gctl_error(req, "unknown plex '%s'", object); return; } - err = gv_rename_plex(sc, req, p, newname, *flags); - if (err) - return; + name = g_malloc(GV_MAXPLEXNAME, M_WAITOK | M_ZERO); + strlcpy(name, newname, GV_MAXPLEXNAME); + gv_post_event(sc, GV_EVENT_RENAME_PLEX, p, name, *flags, 0); break; case GV_TYPE_SD: s = gv_find_sd(sc, object); @@ -106,9 +95,9 @@ gctl_error(req, "unknown subdisk '%s'", object); return; } - err = gv_rename_sd(sc, req, s, newname, *flags); - if (err) - return; + name = g_malloc(GV_MAXSDNAME, M_WAITOK | M_ZERO); + strlcpy(name, newname, GV_MAXSDNAME); + gv_post_event(sc, GV_EVENT_RENAME_SD, s, name, *flags, 0); break; case GV_TYPE_DRIVE: d = gv_find_drive(sc, object); @@ -116,43 +105,40 @@ gctl_error(req, "unknown drive '%s'", object); return; } - err = gv_rename_drive(sc, req, d, newname, *flags); - if (err) - return; + name = g_malloc(GV_MAXDRIVENAME, M_WAITOK | M_ZERO); + strlcpy(name, newname, GV_MAXDRIVENAME); + gv_post_event(sc, GV_EVENT_RENAME_DRIVE, d, name, *flags, 0); break; default: gctl_error(req, "unknown object '%s'", object); return; } - - gv_save_config_all(sc); } -static int -gv_rename_drive(struct gv_softc *sc, struct gctl_req *req, struct gv_drive *d, char *newname, int flags) +int +gv_rename_drive(struct gv_softc *sc, struct gv_drive *d, char *newname, + int flags) { struct gv_sd *s; g_topology_assert(); KASSERT(d != NULL, ("gv_rename_drive: NULL d")); - if (gv_object_type(sc, newname) != -1) { - gctl_error(req, "drive name '%s' already in use", newname); - return (-1); + if (gv_object_type(sc, newname) != GV_ERR_NOTFOUND) { + printf("VINUM: drive name '%s' already in use\n", newname); + return (GV_ERR_NAMETAKEN); 
} strncpy(d->name, newname, GV_MAXDRIVENAME); - /* XXX can we rename providers here? */ - LIST_FOREACH(s, &d->subdisks, from_drive) strncpy(s->drive, d->name, GV_MAXDRIVENAME); return (0); } -static int -gv_rename_plex(struct gv_softc *sc, struct gctl_req *req, struct gv_plex *p, char *newname, int flags) +int +gv_rename_plex(struct gv_softc *sc, struct gv_plex *p, char *newname, int flags) { struct gv_sd *s; char *plexnum, *plexnump, *oldplex, *oldplexp; @@ -164,9 +150,9 @@ err = 0; - if (gv_object_type(sc, newname) != -1) { - gctl_error(req, "plex name '%s' already in use", newname); - return (-1); + if (gv_object_type(sc, newname) != GV_ERR_NOTFOUND) { + printf("VINUM: plex name '%s' already in use\n", newname); + return (GV_ERR_NAMETAKEN); } /* Needed for sanity checking. */ @@ -187,15 +173,15 @@ strsep(&oldplexp, "."); strsep(&plexnump, "."); if (plexnump == NULL || *plexnump == '\0') { - gctl_error(req, "proposed plex name '%s' is not a valid plex " - "name", newname); - err = -1; + printf("VINUM: proposed plex name '%s' is not a valid plex " + "name\n", newname); + err = GV_ERR_INVNAME; goto failure; } if (strcmp(oldplexp, plexnump)) { - gctl_error(req, "current and proposed plex numbers (%s, %s) " - "do not match", plexnump, oldplexp); - err = -1; + printf("VINUM: current and proposed plex numbers (%s, %s) " + "do not match\n", plexnump, oldplexp); + err = GV_ERR_INVNAME; goto failure; } @@ -218,7 +204,7 @@ strsep(&oldsdp, "."); strsep(&oldsdp, "."); snprintf(newsd, GV_MAXSDNAME, "%s.%s", p->name, oldsdp); - err = gv_rename_sd(sc, req, s, newsd, flags); + err = gv_rename_sd(sc, s, newsd, flags); g_free(newsd); g_free(oldsd); if (err) @@ -238,8 +224,8 @@ * since there are no structures below a subdisk. Similarly, we don't have to * clean up any references elsewhere to the subdisk's name. 
*/ -static int -gv_rename_sd(struct gv_softc *sc, struct gctl_req *req, struct gv_sd *s, char * newname, int flags) +int +gv_rename_sd(struct gv_softc *sc, struct gv_sd *s, char *newname, int flags) { char *new, *newp, *old, *oldp; int err; @@ -249,9 +235,9 @@ err = 0; - if (gv_object_type(sc, newname) != -1) { - gctl_error(req, "subdisk name %s already in use", newname); - return (-1); + if (gv_object_type(sc, newname) != GV_ERR_NOTFOUND) { + printf("VINUM: subdisk name %s already in use\n", newname); + return (GV_ERR_NAMETAKEN); } /* Needed for sanity checking. */ @@ -273,29 +259,28 @@ strsep(&oldp, "."); strsep(&newp, "."); if (newp == NULL || *newp == '\0') { - gctl_error(req, "proposed sd name '%s' is not a valid sd name", + printf("VINUM: proposed sd name '%s' is not a valid sd name\n", newname); - err = -1; + err = GV_ERR_INVNAME; goto fail; } strsep(&newp, "."); if (newp == NULL || *newp == '\0') { - gctl_error(req, "proposed sd name '%s' is not a valid sd name", + printf("VINUM: proposed sd name '%s' is not a valid sd name\n", newname); - err = -1; + err = GV_ERR_INVNAME; goto fail; } - if (strcmp(newp, oldp)) { - gctl_error(req, "current and proposed sd numbers (%s, %s) do " - "not match", oldp, newp); - err = -1; + /* XXX: Uhm, why is this important?. */ +/* if (strcmp(newp, oldp)) { + printf("VINUM: current and proposed sd numbers (%s, %s) do " + "not match\n", oldp, newp); + err = GV_ERR_INVNAME; goto fail; - } + }*/ strncpy(s->name, newname, GV_MAXSDNAME); - /* XXX: can we rename providers here? 
*/ - fail: g_free(new); g_free(old); @@ -303,8 +288,9 @@ return (err); } -static int -gv_rename_vol(struct gv_softc *sc, struct gctl_req *req, struct gv_volume *v, char *newname, int flags) +int +gv_rename_vol(struct gv_softc *sc, struct gv_volume *v, char *newname, + int flags) { struct gv_plex *p; char *new, *old, *oldp; @@ -313,9 +299,9 @@ g_topology_assert(); KASSERT(v != NULL, ("gv_rename_vol: NULL v")); - if (gv_object_type(sc, newname) != -1) { - gctl_error(req, "volume name %s already in use", newname); - return (-1); + if (gv_object_type(sc, newname) != GV_ERR_NOTFOUND) { + printf("VINUM: volume name %s already in use", newname); + return (GV_ERR_NAMETAKEN); } /* Rename the volume. */ @@ -335,7 +321,7 @@ */ strsep(&oldp, "."); snprintf(new, GV_MAXPLEXNAME, "%s.%s", v->name, oldp); - err = gv_rename_plex(sc, req, p, new, flags); + err = gv_rename_plex(sc, p, new, flags); g_free(new); g_free(old); if (err) Index: sys/geom/vinum/geom_vinum_rm.c =================================================================== RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum_rm.c,v retrieving revision 1.13 diff -u -u -r1.13 geom_vinum_rm.c --- sys/geom/vinum/geom_vinum_rm.c 12 Apr 2007 17:54:35 -0000 1.13 +++ sys/geom/vinum/geom_vinum_rm.c 3 Nov 2007 02:40:17 -0000 @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -30,20 +30,11 @@ #include #include -#include #include #include #include #include -#include - -static int gv_rm_drive(struct gv_softc *, struct gctl_req *, - struct gv_drive *, int); -static int gv_rm_plex(struct gv_softc *, struct gctl_req *, - struct gv_plex *, int); -static int gv_rm_vol(struct gv_softc *, struct gctl_req *, - struct gv_volume *, int); /* General 'remove' routine. 
*/ void @@ -56,7 +47,7 @@ struct gv_drive *d; int *argc, *flags; char *argv, buf[20]; - int i, type, err; + int i, type; argc = gctl_get_paraml(req, "argc", sizeof(*argc)); flags = gctl_get_paraml(req, "flags", sizeof(*flags)); @@ -68,6 +59,8 @@ sc = gp->softc; + /* XXX config locking */ + for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); argv = gctl_get_param(req, buf, NULL); @@ -77,184 +70,176 @@ switch (type) { case GV_TYPE_VOL: v = gv_find_vol(sc, argv); - if (v == NULL) { - gctl_error(req, "unknown volume '%s'", argv); + + /* + * If this volume has plexes, we want a recursive + * removal. + */ + if (!LIST_EMPTY(&v->plexes) && !(*flags & GV_FLAG_R)) { + gctl_error(req, "volume '%s' has attached " + "plexes - need recursive removal", v->name); return; } - err = gv_rm_vol(sc, req, v, *flags); - if (err) - return; + + gv_post_event(sc, GV_EVENT_RM_VOLUME, v, NULL, 0, 0); break; + case GV_TYPE_PLEX: p = gv_find_plex(sc, argv); - if (p == NULL) { - gctl_error(req, "unknown plex '%s'", argv); + + /* + * If this plex has subdisks, we want a recursive + * removal. + */ + if (!LIST_EMPTY(&p->subdisks) && + !(*flags & GV_FLAG_R)) { + gctl_error(req, "plex '%s' has attached " + "subdisks - need recursive removal", + p->name); return; } - err = gv_rm_plex(sc, req, p, *flags); - if (err) + + /* Don't allow removal of the only plex of a volume. */ + if (p->vol_sc != NULL && p->vol_sc->plexcount == 1) { + gctl_error(req, "plex '%s' is still attached " + "to volume '%s'", p->name, p->volume); return; + } + + gv_post_event(sc, GV_EVENT_RM_PLEX, p, NULL, 0, 0); break; + case GV_TYPE_SD: s = gv_find_sd(sc, argv); - if (s == NULL) { - gctl_error(req, "unknown subdisk '%s'", argv); + + /* Don't allow removal if attached to a plex. 
*/ + if (s->plex_sc != NULL) { + gctl_error(req, "subdisk '%s' is still attached" + " to plex '%s'", s->name, s->plex_sc->name); return; } - err = gv_rm_sd(sc, req, s, *flags); - if (err) - return; + + gv_post_event(sc, GV_EVENT_RM_SD, s, NULL, 0, 0); break; + case GV_TYPE_DRIVE: d = gv_find_drive(sc, argv); - if (d == NULL) { - gctl_error(req, "unknown drive '%s'", argv); + /* We don't allow to remove open drives. */ + if (gv_consumer_is_open(d->consumer)) { + gctl_error(req, "drive '%s' is open", d->name); return; } - err = gv_rm_drive(sc, req, d, *flags); - if (err) + + /* A drive with subdisks needs a recursive removal. */ +/* if (!LIST_EMPTY(&d->subdisks) && + !(*flags & GV_FLAG_R)) { + gctl_error(req, "drive '%s' still has subdisks" + " - need recursive removal", d->name); return; + }*/ + + gv_post_event(sc, GV_EVENT_RM_DRIVE, d, NULL, *flags, + 0); break; + default: gctl_error(req, "unknown object '%s'", argv); return; } } - gv_save_config_all(sc); + gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } /* Resets configuration */ int -gv_resetconfig(struct g_geom *gp, struct gctl_req *req) +gv_resetconfig(struct gv_softc *sc) { - struct gv_softc *sc; struct gv_drive *d, *d2; struct gv_volume *v, *v2; struct gv_plex *p, *p2; struct gv_sd *s, *s2; - int flags; - d = NULL; - d2 = NULL; - p = NULL; - p2 = NULL; - s = NULL; - s2 = NULL; - flags = GV_FLAG_R; - sc = gp->softc; - /* First loop through to make sure no volumes are up */ - LIST_FOREACH_SAFE(v, &sc->volumes, volume, v2) { - if (gv_is_open(v->geom)) { - gctl_error(req, "volume '%s' is busy", v->name); - return (-1); + /* First make sure nothing is open. */ + LIST_FOREACH_SAFE(d, &sc->drives, drive, d2) { + if (gv_consumer_is_open(d->consumer)) { + return (GV_ERR_ISBUSY); } } /* Then if not, we remove everything. 
*/ - LIST_FOREACH_SAFE(v, &sc->volumes, volume, v2) - gv_rm_vol(sc, req, v, flags); - LIST_FOREACH_SAFE(p, &sc->plexes, plex, p2) - gv_rm_plex(sc, req, p, flags); LIST_FOREACH_SAFE(s, &sc->subdisks, sd, s2) - gv_rm_sd(sc, req, s, flags); + gv_rm_sd(sc, s); LIST_FOREACH_SAFE(d, &sc->drives, drive, d2) - gv_rm_drive(sc, req, d, flags); - gv_save_config_all(sc); + gv_rm_drive(sc, d, 0); + LIST_FOREACH_SAFE(p, &sc->plexes, plex, p2) + gv_rm_plex(sc, p); + LIST_FOREACH_SAFE(v, &sc->volumes, volume, v2) + gv_rm_vol(sc, v); + + gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); + return (0); } /* Remove a volume. */ -static int -gv_rm_vol(struct gv_softc *sc, struct gctl_req *req, struct gv_volume *v, int flags) +void +gv_rm_vol(struct gv_softc *sc, struct gv_volume *v) { - struct g_geom *gp; + struct g_provider *pp; struct gv_plex *p, *p2; - int err; - g_topology_assert(); KASSERT(v != NULL, ("gv_rm_vol: NULL v")); - /* If this volume has plexes, we want a recursive removal. */ - if (!LIST_EMPTY(&v->plexes) && !(flags & GV_FLAG_R)) { - gctl_error(req, "volume '%s' has attached plexes", v->name); - return (-1); - } - - gp = v->geom; + /* XXX gp = v->geom; */ + pp = v->provider; /* Check if any of our consumers is open. */ +/* XXX if (gp != NULL && gv_is_open(gp)) { gctl_error(req, "volume '%s' is busy", v->name); return (-1); } +*/ /* Remove the plexes our volume has. */ - LIST_FOREACH_SAFE(p, &v->plexes, in_volume, p2) { - v->plexcount--; - LIST_REMOVE(p, in_volume); - p->vol_sc = NULL; + LIST_FOREACH_SAFE(p, &v->plexes, in_volume, p2) + gv_rm_plex(sc, p); - err = gv_rm_plex(sc, req, p, flags); - if (err) - return (err); - } - - /* Clean up and let our geom fade away. */ + /* Clean up. */ LIST_REMOVE(v, volume); - gv_kill_vol_thread(v); g_free(v); - if (gp != NULL) { - gp->softc = NULL; - g_wither_geom(gp, ENXIO); - } - return (0); + /* Get rid of the volume's provider. 
*/ + if (pp != NULL) { + g_topology_lock(); + pp->flags |= G_PF_WITHER; + g_orphan_provider(pp, ENXIO); + g_topology_unlock(); + } } /* Remove a plex. */ -static int -gv_rm_plex(struct gv_softc *sc, struct gctl_req *req, struct gv_plex *p, int flags) +void +gv_rm_plex(struct gv_softc *sc, struct gv_plex *p) { - struct g_geom *gp; struct gv_volume *v; struct gv_sd *s, *s2; - int err; - - g_topology_assert(); KASSERT(p != NULL, ("gv_rm_plex: NULL p")); - /* If this plex has subdisks, we want a recursive removal. */ - if (!LIST_EMPTY(&p->subdisks) && !(flags & GV_FLAG_R)) { - gctl_error(req, "plex '%s' has attached subdisks", p->name); - return (-1); - } - - if (p->vol_sc != NULL && p->vol_sc->plexcount == 1) { - gctl_error(req, "plex '%s' is still attached to volume '%s'", - p->name, p->volume); - return (-1); - } - - gp = p->geom; + /* XXX gp = p->geom; */ /* Check if any of our consumers is open. */ +/* XXX if (gp != NULL && gv_is_open(gp)) { gctl_error(req, "plex '%s' is busy", p->name); return (-1); } +*/ /* Remove the subdisks our plex has. */ - LIST_FOREACH_SAFE(s, &p->subdisks, in_plex, s2) { -#if 0 - LIST_REMOVE(s, in_plex); - s->plex_sc = NULL; -#endif - - err = gv_rm_sd(sc, req, s, flags); - if (err) - return (err); - } + LIST_FOREACH_SAFE(s, &p->subdisks, in_plex, s2) + gv_rm_sd(sc, s); v = p->vol_sc; /* Clean up and let our geom fade away. */ @@ -267,35 +252,25 @@ gv_update_vol_size(v, gv_vol_size(v)); } - gv_kill_plex_thread(p); g_free(p); - - if (gp != NULL) { - gp->softc = NULL; - g_wither_geom(gp, ENXIO); - } - - return (0); } /* Remove a subdisk. */ -int -gv_rm_sd(struct gv_softc *sc, struct gctl_req *req, struct gv_sd *s, int flags) +void +gv_rm_sd(struct gv_softc *sc, struct gv_sd *s) { - struct g_provider *pp; struct gv_plex *p; struct gv_volume *v; KASSERT(s != NULL, ("gv_rm_sd: NULL s")); - pp = s->provider; p = s->plex_sc; v = NULL; /* Clean up. 
*/ if (p != NULL) { LIST_REMOVE(s, in_plex); - + s->plex_sc = NULL; p->sdcount--; /* Update the plexsize. */ p->size = gv_plex_size(p); @@ -305,77 +280,65 @@ gv_update_vol_size(v, gv_vol_size(v)); } } - if (s->drive_sc) + if (s->drive_sc && !(s->drive_sc->flags & GV_DRIVE_REFERENCED)) LIST_REMOVE(s, from_drive); LIST_REMOVE(s, sd); gv_free_sd(s); g_free(s); - - /* If the subdisk has a provider we need to clean up this one too. */ - if (pp != NULL) { - pp->flags |= G_PF_WITHER; - g_orphan_provider(pp, ENXIO); - } - - return (0); } /* Remove a drive. */ -static int -gv_rm_drive(struct gv_softc *sc, struct gctl_req *req, struct gv_drive *d, int flags) +void +gv_rm_drive(struct gv_softc *sc, struct gv_drive *d, int flags) { - struct g_geom *gp; struct g_consumer *cp; struct gv_freelist *fl, *fl2; struct gv_plex *p; struct gv_sd *s, *s2; struct gv_volume *v; + struct gv_drive *d2; int err; KASSERT(d != NULL, ("gv_rm_drive: NULL d")); - gp = d->geom; - KASSERT(gp != NULL, ("gv_rm_drive: NULL gp")); - /* We don't allow to remove open drives. */ - if (gv_is_open(gp)) { - gctl_error(req, "drive '%s' is open", d->name); - return (-1); - } + cp = d->consumer; - /* A drive with subdisks needs a recursive removal. */ - if (!LIST_EMPTY(&d->subdisks) && !(flags & GV_FLAG_R)) { - gctl_error(req, "drive '%s' still has subdisks", d->name); - return (-1); - } + if (cp != NULL) { + g_topology_lock(); + err = g_access(cp, 0, 1, 0); + g_topology_unlock(); + + if (err) { + printf("VINUM: gv_rm_drive: couldn't access '%s', " + "errno: %d\n", cp->provider->name, err); + return; + } - cp = LIST_FIRST(&gp->consumer); - err = g_access(cp, 0, 1, 0); - if (err) { - printf("GEOM_VINUM: gv_rm_drive: couldn't access '%s', errno: " - "%d\n", cp->provider->name, err); - return (err); - } - - /* Clear the Vinum Magic. 
*/ - d->hdr->magic = GV_NOMAGIC; - g_topology_unlock(); - err = g_write_data(cp, GV_HDR_OFFSET, d->hdr, GV_HDR_LEN); - if (err) { - printf("GEOM_VINUM: gv_rm_drive: couldn't write header to '%s'" - ", errno: %d\n", cp->provider->name, err); - d->hdr->magic = GV_MAGIC; + /* Clear the Vinum Magic. */ + d->hdr->magic = GV_NOMAGIC; + err = g_write_data(cp, GV_HDR_OFFSET, d->hdr, GV_HDR_LEN); + if (err) + printf("VINUM: gv_rm_drive: couldn't write header to " + "'%s', errno: %d\n", cp->provider->name, err); + + g_topology_lock(); + g_access(cp, -cp->acr, -cp->acw, -cp->ace); + g_detach(cp); + g_destroy_consumer(cp); + g_topology_unlock(); } - g_topology_lock(); - g_access(cp, 0, -1, 0); /* Remove all associated subdisks, plexes, volumes. */ - if (!LIST_EMPTY(&d->subdisks)) { - LIST_FOREACH_SAFE(s, &d->subdisks, from_drive, s2) { - p = s->plex_sc; - if (p != NULL) { - v = p->vol_sc; - if (v != NULL) - gv_rm_vol(sc, req, v, flags); + /* XXX not quite correct. Perhaps now...? */ + if (flags & GV_FLAG_R) { + if (!LIST_EMPTY(&d->subdisks)) { + LIST_FOREACH_SAFE(s, &d->subdisks, from_drive, s2) { + p = s->plex_sc; + if (p != NULL) { + v = p->vol_sc; + if (v != NULL) + gv_rm_vol(sc, v); + } } } } @@ -385,15 +348,33 @@ LIST_REMOVE(fl, freelist); g_free(fl); } - LIST_REMOVE(d, drive); - gv_kill_drive_thread(d); - gp = d->geom; - d->geom = NULL; + LIST_REMOVE(d, drive); g_free(d->hdr); + + /* Put ourself into referenced state if we have subdisks. */ + if (d->sdcount > 0) { + d->consumer = NULL; + d->hdr = NULL; + d->flags |= GV_DRIVE_REFERENCED; + snprintf(d->device, GV_MAXDRIVENAME, "???"); + d->size = 0; + d->avail = 0; + d->freelist_entries = 0; + LIST_FOREACH(s, &d->subdisks, from_drive) { + s->flags |= GV_SD_TASTED; + gv_set_sd_state(s, GV_SD_DOWN, GV_SETSTATE_FORCE); + } + /* Shuffle around so we keep gv_is_newer happy. 
*/ + LIST_REMOVE(d, drive); + d2 = LIST_FIRST(&sc->drives); + if (d2 == NULL) + LIST_INSERT_HEAD(&sc->drives, d, drive); + else + LIST_INSERT_AFTER(d2, d, drive); + return; + } g_free(d); - gv_save_config_all(sc); - g_wither_geom(gp, ENXIO); - return (err); + gv_save_config(sc); } Index: sys/geom/vinum/geom_vinum_share.c =================================================================== RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum_share.c,v retrieving revision 1.5 diff -u -u -r1.5 geom_vinum_share.c --- sys/geom/vinum/geom_vinum_share.c 12 Apr 2007 17:40:44 -0000 1.5 +++ sys/geom/vinum/geom_vinum_share.c 3 Nov 2007 02:40:17 -0000 @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 1997, 1998, 1999 * Nan Yang Computer Services Limited. All rights reserved. * @@ -45,10 +45,6 @@ #include #ifdef _KERNEL -#include -#include -#include -#include #include #include @@ -63,7 +59,6 @@ #define g_free free #endif /* _KERNEL */ -#include #include #include @@ -237,6 +232,8 @@ return (GV_SD_UP); else if (!strcmp(buf, "reviving")) return (GV_SD_REVIVING); + else if (!strcmp(buf, "initializing")) + return (GV_SD_INITIALIZING); else if (!strcmp(buf, "stale")) return (GV_SD_STALE); else @@ -273,6 +270,8 @@ return (GV_PLEX_INITIALIZING); else if (!strcmp(buf, "degraded")) return (GV_PLEX_DEGRADED); + else if (!strcmp(buf, "growable")) + return (GV_PLEX_GROWABLE); else return (GV_PLEX_DOWN); } @@ -288,6 +287,8 @@ return "initializing"; case GV_PLEX_DEGRADED: return "degraded"; + case GV_PLEX_GROWABLE: + return "growable"; case GV_PLEX_UP: return "up"; default: @@ -378,14 +379,13 @@ return (NULL); #ifdef _KERNEL - d = g_malloc(sizeof(struct gv_drive), M_WAITOK | M_ZERO); - + d = g_malloc(sizeof(struct gv_drive), M_NOWAIT); #else d = malloc(sizeof(struct gv_drive)); +#endif if (d == NULL) return (NULL); bzero(d, sizeof(struct gv_drive)); -#endif errors = 0; for (j = 1; j < max; j++) { @@ -435,14 +435,13 @@ return (NULL); 
#ifdef _KERNEL - v = g_malloc(sizeof(struct gv_volume), M_WAITOK | M_ZERO); - + v = g_malloc(sizeof(struct gv_volume), M_NOWAIT); #else v = malloc(sizeof(struct gv_volume)); +#endif if (v == NULL) return (NULL); bzero(v, sizeof(struct gv_volume)); -#endif errors = 0; for (j = 1; j < max; j++) { @@ -481,13 +480,13 @@ return (NULL); #ifdef _KERNEL - p = g_malloc(sizeof(struct gv_plex), M_WAITOK | M_ZERO); + p = g_malloc(sizeof(struct gv_plex), M_NOWAIT); #else p = malloc(sizeof(struct gv_plex)); +#endif if (p == NULL) return (NULL); bzero(p, sizeof(struct gv_plex)); -#endif errors = 0; for (j = 1; j < max; j++) { @@ -554,16 +553,16 @@ int j, errors; if (token[1] == NULL || *token[1] == '\0') - return NULL; + return (NULL); #ifdef _KERNEL - s = g_malloc(sizeof(struct gv_sd), M_WAITOK | M_ZERO); + s = g_malloc(sizeof(struct gv_sd), M_NOWAIT); #else s = malloc(sizeof(struct gv_sd)); +#endif if (s == NULL) - return NULL; + return (NULL); bzero(s, sizeof(struct gv_sd)); -#endif s->plex_offset = -1; s->size = -1; Index: sys/geom/vinum/geom_vinum_share.h =================================================================== RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum_share.h,v retrieving revision 1.2 diff -u -u -r1.2 geom_vinum_share.h --- sys/geom/vinum/geom_vinum_share.h 15 Nov 2004 12:30:59 -0000 1.2 +++ sys/geom/vinum/geom_vinum_share.h 3 Nov 2007 02:40:17 -0000 @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without Index: sys/geom/vinum/geom_vinum_state.c =================================================================== RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum_state.c,v retrieving revision 1.8 diff -u -u -r1.8 geom_vinum_state.c --- sys/geom/vinum/geom_vinum_state.c 30 Mar 2006 14:01:25 -0000 1.8 +++ sys/geom/vinum/geom_vinum_state.c 3 Nov 2007 02:40:17 -0000 @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -27,8 +27,6 @@ #include __FBSDID("$FreeBSD: src/sys/geom/vinum/geom_vinum_state.c,v 1.8 2006/03/30 14:01:25 le Exp $"); -#include -#include #include #include @@ -43,8 +41,10 @@ struct gv_softc *sc; struct gv_sd *s; struct gv_drive *d; + struct gv_volume *v; + struct gv_plex *p; char *obj, *state; - int err, f, *flags, newstate, type; + int f, *flags, type; f = 0; obj = gctl_get_param(req, "object", NULL); @@ -72,43 +72,52 @@ type = gv_object_type(sc, obj); switch (type) { case GV_TYPE_VOL: + if (gv_volstatei(state) < 0) { + gctl_error(req, "invalid volume state '%s'", state); + break; + } + v = gv_find_vol(sc, obj); + gv_post_event(sc, GV_EVENT_SET_VOL_STATE, v, NULL, + gv_volstatei(state), f); + break; + case GV_TYPE_PLEX: - gctl_error(req, "volume or plex state cannot be set currently"); + if (gv_plexstatei(state) < 0) { + gctl_error(req, "invalid plex state '%s'", state); + break; + } + p = gv_find_plex(sc, obj); + gv_post_event(sc, GV_EVENT_SET_PLEX_STATE, p, NULL, + gv_plexstatei(state), f); break; case GV_TYPE_SD: - newstate = gv_sdstatei(state); - if (newstate < 0) { + if (gv_sdstatei(state) < 0) { gctl_error(req, "invalid subdisk state '%s'", state); break; } s = gv_find_sd(sc, obj); - err = gv_set_sd_state(s, newstate, f); - if (err) - gctl_error(req, "cannot set subdisk state"); + gv_post_event(sc, GV_EVENT_SET_SD_STATE, s, NULL, + 
gv_sdstatei(state), f); break; case GV_TYPE_DRIVE: - newstate = gv_drivestatei(state); - if (newstate < 0) { + if (gv_drivestatei(state) < 0) { gctl_error(req, "invalid drive state '%s'", state); break; } d = gv_find_drive(sc, obj); - err = gv_set_drive_state(d, newstate, f); - if (err) - gctl_error(req, "cannot set drive state"); + gv_post_event(sc, GV_EVENT_SET_DRIVE_STATE, d, NULL, + gv_drivestatei(state), f); break; default: gctl_error(req, "unknown object '%s'", obj); break; } - - return; } -/* Update drive state; return 0 if the state changes, otherwise -1. */ +/* Update drive state; return 0 if the state changes, otherwise error. */ int gv_set_drive_state(struct gv_drive *d, int newstate, int flags) { @@ -123,9 +132,9 @@ return (0); /* We allow to take down an open drive only with force. */ - if ((newstate == GV_DRIVE_DOWN) && gv_is_open(d->geom) && + if ((newstate == GV_DRIVE_DOWN) && gv_consumer_is_open(d->consumer) && (!(flags & GV_SETSTATE_FORCE))) - return (-1); + return (GV_ERR_ISBUSY); d->state = newstate; @@ -136,7 +145,7 @@ /* Save the config back to disk. */ if (flags & GV_SETSTATE_CONFIG) - gv_save_config_all(d->vinumconf); + gv_save_config(d->vinumconf); return (0); } @@ -165,14 +174,24 @@ * force. */ if ((s->plex_sc != NULL) && !(flags & GV_SETSTATE_FORCE)) - return (-1); + return (GV_ERR_ISATTACHED); + break; + + case GV_SD_REVIVING: + case GV_SD_INITIALIZING: + /* + * Only do this if we're forced, since it usually is done + * internally, and then we do use the force flag. + */ + if (!flags & GV_SETSTATE_FORCE) + return (GV_ERR_SETSTATE); break; case GV_SD_UP: /* We can't bring the subdisk up if our drive is dead. */ d = s->drive_sc; if ((d == NULL) || (d->state != GV_DRIVE_UP)) - return (-1); + return (GV_ERR_SETSTATE); /* Check from where we want to be brought up. 
*/ switch (s->state) { @@ -201,12 +220,15 @@ if (p->org != GV_PLEX_RAID5) break; - else if (flags & GV_SETSTATE_FORCE) + else if (s->flags & GV_SD_CANGOUP) { + s->flags &= ~GV_SD_CANGOUP; + break; + } else if (flags & GV_SETSTATE_FORCE) break; else s->state = GV_SD_STALE; - status = -1; + status = GV_ERR_SETSTATE; break; case GV_SD_STALE: @@ -225,17 +247,17 @@ (p->vol_sc->plexcount == 1)) break; else - return (-1); + return (GV_ERR_SETSTATE); default: - return (-1); + return (GV_ERR_INVSTATE); } break; /* Other state transitions are only possible with force. */ default: if (!(flags & GV_SETSTATE_FORCE)) - return (-1); + return (GV_ERR_SETSTATE); } /* We can change the state and do it. */ @@ -248,11 +270,102 @@ /* Save the config back to disk. */ if (flags & GV_SETSTATE_CONFIG) - gv_save_config_all(s->vinumconf); + gv_save_config(s->vinumconf); return (status); } +int +gv_set_plex_state(struct gv_plex *p, int newstate, int flags) +{ + struct gv_volume *v; + int oldstate, plexdown; + + KASSERT(p != NULL, ("gv_set_plex_state: NULL p")); + + oldstate = p->state; + v = p->vol_sc; + plexdown = 0; + + if (newstate == oldstate) + return (0); + + switch (newstate) { + case GV_PLEX_UP: + /* Let update_plex handle if the plex can come up */ + gv_update_plex_state(p); + if (p->state != GV_PLEX_UP && !(flags & GV_SETSTATE_FORCE)) + return (GV_ERR_SETSTATE); + p->state = newstate; + break; + case GV_PLEX_DOWN: + /* + * Set state to GV_PLEX_DOWN only if no-one is using the plex, + * or if the state is forced. + */ + if (v != NULL) { + /* If the only one up, force is needed. */ + plexdown = gv_plexdown(v); + if ((v->plexcount == 1 || + (v->plexcount - plexdown == 1)) && + ((flags & GV_SETSTATE_FORCE) == 0)) + return (GV_ERR_SETSTATE); + } + p->state = newstate; + break; + case GV_PLEX_DEGRADED: + /* Only used internally, so we have to be forced. */ + if (flags & GV_SETSTATE_FORCE) + p->state = newstate; + break; + } + + /* Update our volume if we have one. 
*/ + if (v != NULL) + gv_update_vol_state(v); + + /* Save config. */ + if (flags & GV_SETSTATE_CONFIG) + gv_save_config(p->vinumconf); + return (0); +} + +int +gv_set_vol_state(struct gv_volume *v, int newstate, int flags) +{ + int oldstate; + + KASSERT(v != NULL, ("gv_set_vol_state: NULL v")); + + oldstate = v->state; + + if (newstate == oldstate) + return (0); + + switch (newstate) { + case GV_VOL_UP: + /* Let update handle if the volume can come up. */ + gv_update_vol_state(v); + if (v->state != GV_VOL_UP && !(flags & GV_SETSTATE_FORCE)) + return (GV_ERR_SETSTATE); + v->state = newstate; + break; + case GV_VOL_DOWN: + /* + * Set state to GV_VOL_DOWN only if no-one is using the volume, + * or if the state should be forced. + */ + if (!gv_provider_is_open(v->provider) && + !(flags & GV_SETSTATE_FORCE)) + return (GV_ERR_ISBUSY); + v->state = newstate; + break; + } + /* Save config */ + if (flags & GV_SETSTATE_CONFIG) + gv_save_config(v->vinumconf); + return (0); +} /* Update the state of a subdisk based on its environment. */ void @@ -268,19 +381,23 @@ oldstate = s->state; /* If our drive isn't up we cannot be up either. */ - if (d->state != GV_DRIVE_UP) + if (d->state != GV_DRIVE_UP) { s->state = GV_SD_DOWN; /* If this subdisk was just created, we assume it is good.*/ - else if (s->flags & GV_SD_NEWBORN) { + } else if (s->flags & GV_SD_NEWBORN) { s->state = GV_SD_UP; s->flags &= ~GV_SD_NEWBORN; - } else if (s->state != GV_SD_UP) - s->state = GV_SD_STALE; - else + } else if (s->state != GV_SD_UP) { + if (s->flags & GV_SD_CANGOUP) { + s->state = GV_SD_UP; + s->flags &= ~GV_SD_CANGOUP; + } else + s->state = GV_SD_STALE; + } else s->state = GV_SD_UP; if (s->state != oldstate) - printf("GEOM_VINUM: subdisk %s state change: %s -> %s\n", + printf("VINUM: subdisk %s state change: %s -> %s\n", s->name, gv_sdstate(oldstate), gv_sdstate(s->state)); /* Update the plex, if we have one. 
*/ @@ -292,6 +409,7 @@ void gv_update_plex_state(struct gv_plex *p) { + struct gv_sd *s; int sdstates; int oldstate; @@ -316,15 +434,26 @@ /* Some of our subdisks are initializing. */ } else if (sdstates & GV_SD_INITSTATE) { - if (p->flags & GV_PLEX_SYNCING) + + if (p->flags & GV_PLEX_SYNCING || + p->flags & GV_PLEX_REBUILDING) p->state = GV_PLEX_DEGRADED; else p->state = GV_PLEX_DOWN; } else p->state = GV_PLEX_DOWN; + if (p->state == GV_PLEX_UP) { + LIST_FOREACH(s, &p->subdisks, in_plex) { + if (s->flags & GV_SD_GROW) { + p->state = GV_PLEX_GROWABLE; + break; + } + } + } + if (p->state != oldstate) - printf("GEOM_VINUM: plex %s state change: %s -> %s\n", p->name, + printf("VINUM: plex %s state change: %s -> %s\n", p->name, gv_plexstate(oldstate), gv_plexstate(p->state)); /* Update our volume, if we have one. */ Index: sys/geom/vinum/geom_vinum_subr.c =================================================================== RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum_subr.c,v retrieving revision 1.16 diff -u -u -r1.16 geom_vinum_subr.c --- sys/geom/vinum/geom_vinum_subr.c 12 Apr 2007 17:54:35 -0000 1.16 +++ sys/geom/vinum/geom_vinum_subr.c 3 Nov 2007 02:40:17 -0000 @@ -1,5 +1,6 @@ /*- - * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 2004, 2007 Lukas Ertl + * Copyright (c) 2007 Ulf Lilleengen * Copyright (c) 1997, 1998, 1999 * Nan Yang Computer Services Limited. All rights reserved. * @@ -42,59 +43,28 @@ __FBSDID("$FreeBSD: src/sys/geom/vinum/geom_vinum_subr.c,v 1.16 2007/04/12 17:54:35 le Exp $"); #include -#include -#include -#include #include #include #include -#include #include #include #include -static off_t gv_plex_smallest_sd(struct gv_plex *, off_t); +int gv_drive_is_newer(struct gv_softc *, struct gv_drive *); +static off_t gv_plex_smallest_sd(struct gv_plex *); -/* Find the VINUM class and it's associated geom. 
*/ -struct g_geom * -find_vinum_geom(void) -{ - struct g_class *mp; - struct g_geom *gp; - - g_topology_assert(); - - gp = NULL; - - LIST_FOREACH(mp, &g_classes, class) { - if (!strcmp(mp->name, "VINUM")) { - gp = LIST_FIRST(&mp->geom); - break; - } - } - - return (gp); -} - -/* - * Parse the vinum config provided in *buf and store it in *gp's softc. - * If parameter 'merge' is non-zero, then the given config is merged into - * *gp. - */ void -gv_parse_config(struct gv_softc *sc, u_char *buf, int merge) +gv_parse_config(struct gv_softc *sc, char *buf, struct gv_drive *d) { char *aptr, *bptr, *cptr; struct gv_volume *v, *v2; struct gv_plex *p, *p2; struct gv_sd *s, *s2; - int tokens; + int error, is_newer, tokens; char *token[GV_MAXARGS]; - g_topology_assert(); - - KASSERT(sc != NULL, ("gv_parse_config: NULL softc")); + is_newer = gv_drive_is_newer(sc, d); /* Until the end of the string *buf. */ for (aptr = buf; *aptr != '\0'; aptr = bptr) { @@ -109,64 +79,96 @@ tokens = gv_tokenize(cptr, token, GV_MAXARGS); - if (tokens > 0) { - if (!strcmp(token[0], "volume")) { - v = gv_new_volume(tokens, token); - if (v == NULL) { - printf("geom_vinum: failed volume\n"); - break; - } + if (tokens <= 0) + continue; - if (merge) { - v2 = gv_find_vol(sc, v->name); - if (v2 != NULL) { - g_free(v); - continue; - } - } + if (!strcmp(token[0], "volume")) { + v = gv_new_volume(tokens, token); + if (v == NULL) { + printf("VINUM: config parse failed volume\n"); + break; + } - v->vinumconf = sc; - LIST_INIT(&v->plexes); - LIST_INSERT_HEAD(&sc->volumes, v, volume); - - } else if (!strcmp(token[0], "plex")) { - p = gv_new_plex(tokens, token); - if (p == NULL) { - printf("geom_vinum: failed plex\n"); - break; + v2 = gv_find_vol(sc, v->name); + if (v2 != NULL) { + /* XXX */ + if (is_newer) { + v2->state = v->state; + printf("VINUM: newer volume found!\n"); } + g_free(v); + continue; + } - if (merge) { - p2 = gv_find_plex(sc, p->name); - if (p2 != NULL) { - g_free(p); - continue; - } + 
gv_create_volume(sc, v); + + } else if (!strcmp(token[0], "plex")) { + p = gv_new_plex(tokens, token); + if (p == NULL) { + printf("VINUM: config parse failed plex\n"); + break; + } + + p2 = gv_find_plex(sc, p->name); + if (p2 != NULL) { + /* XXX */ + if (is_newer) { + p2->state = p->state; + printf("VINUM: newer plex found!\n"); } + g_free(p); + continue; + } - p->vinumconf = sc; - LIST_INIT(&p->subdisks); - LIST_INSERT_HEAD(&sc->plexes, p, plex); + error = gv_create_plex(sc, p); + if (error) + continue; + /* + * These flags were set in gv_create_plex() and are not + * needed here (on-disk config parsing). + */ + p->flags &= ~GV_PLEX_ADDED; + p->flags &= ~GV_PLEX_NEWBORN; - } else if (!strcmp(token[0], "sd")) { - s = gv_new_sd(tokens, token); + } else if (!strcmp(token[0], "sd")) { + s = gv_new_sd(tokens, token); - if (s == NULL) { - printf("geom_vinum: failed subdisk\n"); - break; - } + if (s == NULL) { + printf("VINUM: config parse failed subdisk\n"); + break; + } - if (merge) { - s2 = gv_find_sd(sc, s->name); - if (s2 != NULL) { - g_free(s); - continue; - } + s2 = gv_find_sd(sc, s->name); + if (s2 != NULL) { + /* XXX */ + if (is_newer) { + s2->state = s->state; + printf("VINUM: newer subdisk found!\n"); } - - s->vinumconf = sc; - LIST_INSERT_HEAD(&sc->subdisks, s, sd); + g_free(s); + continue; } + + /* + * Signal that this subdisk was tasted, and could + * possibly reference a drive that isn't in our config + * yet. + */ + s->flags |= GV_SD_TASTED; + + if (s->state == GV_SD_UP) + s->flags |= GV_SD_CANGOUP; + + error = gv_create_sd(sc, s); + if (error) + continue; + + /* + * This flag was set in gv_create_sd() and is not + * needed here (on-disk config parsing). + */ + s->flags &= ~GV_SD_NEWBORN; + s->flags &= ~GV_SD_GROW; } } } @@ -183,8 +185,6 @@ struct gv_plex *p; struct gv_volume *v; - g_topology_assert(); - /* * We don't need the drive configuration if we're not writing the * config to disk. 
@@ -233,17 +233,20 @@ sbuf_printf(sb, " state %s", gv_sdstate(s->state)); sbuf_printf(sb, "\n"); } - - return; } static off_t -gv_plex_smallest_sd(struct gv_plex *p, off_t smallest) +gv_plex_smallest_sd(struct gv_plex *p) { struct gv_sd *s; + off_t smallest; KASSERT(p != NULL, ("gv_plex_smallest_sd: NULL p")); + s = LIST_FIRST(&p->subdisks); + if (s == NULL) + return (-1); + smallest = s->size; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->size < smallest) smallest = s->size; @@ -251,12 +254,29 @@ return (smallest); } +/* Walk over plexes in a volume and count how many are down. */ int -gv_sd_to_plex(struct gv_plex *p, struct gv_sd *s, int check) +gv_plexdown(struct gv_volume *v) { - struct gv_sd *s2; + int plexdown; + struct gv_plex *p; - g_topology_assert(); + KASSERT(v != NULL, ("gv_plexdown: NULL v")); + + plexdown = 0; + + LIST_FOREACH(p, &v->plexes, plex) { + if (p->state == GV_PLEX_DOWN) + plexdown++; + } + return (plexdown); +} + +int +gv_sd_to_plex(struct gv_sd *s, struct gv_plex *p) +{ + struct gv_sd *s2; + off_t psizeorig, remainder, smallest; /* If this subdisk was already given to this plex, do nothing. */ if (s->plex_sc == p) @@ -264,15 +284,56 @@ /* Check correct size of this subdisk. */ s2 = LIST_FIRST(&p->subdisks); - if (s2 != NULL && gv_is_striped(p) && (s2->size != s->size)) { - printf("GEOM_VINUM: need equal sized subdisks for " - "this plex organisation - %s (%jd) <-> %s (%jd)\n", - s2->name, s2->size, s->name, s->size); - return (-1); + /* Adjust the subdisk-size if necessary. */ + if (s2 != NULL && gv_is_striped(p)) { + /* First adjust to the stripesize. */ + remainder = s->size % p->stripesize; + + if (remainder) { + printf("VINUM: size of sd %s is not a " + "multiple of plex stripesize, taking off " + "%jd bytes\n", s->name, + (intmax_t)remainder); + gv_adjust_freespace(s, remainder); + } + + smallest = gv_plex_smallest_sd(p); + /* Then take off extra if other subdisks are smaller. 
*/ + remainder = s->size - smallest; + + /* + * Don't allow a remainder below zero for running plexes, it's too + * painful, and if someone were to accidentally do this, the + * resulting array might be smaller than the original... not good + */ + if (remainder < 0) { + if (!(p->flags & GV_PLEX_NEWBORN)) { + printf("VINUM: sd %s too small for plex %s!\n", + s->name, p->name); + return (GV_ERR_BADSIZE); + } + /* Adjust other subdisks. */ + LIST_FOREACH(s2, &p->subdisks, in_plex) { + printf("VINUM: size of sd %s is to big, " + "taking off %jd bytes\n", s->name, + (intmax_t)remainder); + gv_adjust_freespace(s2, (remainder * -1)); + } + } else if (remainder > 0) { + printf("VINUM: size of sd %s is to big, " + "taking off %jd bytes\n", s->name, + (intmax_t)remainder); + gv_adjust_freespace(s, remainder); + } } /* Find the correct plex offset for this subdisk, if needed. */ if (s->plex_offset == -1) { + /* + * First set it to 0 to catch the case where we had a detached + * subdisk that didn't get any good offset. + */ + s->plex_offset = 0; if (p->sdcount) { LIST_FOREACH(s2, &p->subdisks, in_plex) { if (gv_is_striped(p)) @@ -282,25 +343,7 @@ s->plex_offset = s2->plex_offset + s2->size; } - } else - s->plex_offset = 0; - } - - p->sdcount++; - - /* Adjust the size of our plex. */ - switch (p->org) { - case GV_PLEX_CONCAT: - case GV_PLEX_STRIPED: - p->size += s->size; - break; - - case GV_PLEX_RAID5: - p->size = (p->sdcount - 1) * gv_plex_smallest_sd(p, s->size); - break; - - default: - break; + } } /* There are no subdisks for this plex yet, just insert it. */ @@ -321,6 +364,29 @@ } s->plex_sc = p; + /* Adjust the size of our plex. We check if the plex misses a subdisk, + * so we don't make the plex smaller than it actually should be. + */ + psizeorig = p->size; + p->size = gv_plex_size(p); + /* Make sure the size is not changed. */ + if (p->sddetached > 0) { + if (p->size < psizeorig) { + p->size = psizeorig; + /* We make sure we need another subdisk.
*/ + if (p->sddetached == 1) + p->sddetached++; + } + p->sddetached--; + } else { + if ((p->org == GV_PLEX_RAID5 || + p->org == GV_PLEX_STRIPED) && + !(p->flags & GV_PLEX_NEWBORN) && + p->state >= GV_PLEX_DEGRADED) { + s->flags |= GV_SD_GROW; + } + p->sdcount++; + } return (0); } @@ -328,21 +394,32 @@ void gv_update_vol_size(struct gv_volume *v, off_t size) { - struct g_geom *gp; - struct g_provider *pp; - if (v == NULL) return; + if (v->provider != NULL) { + g_topology_lock(); + v->provider->mediasize = size; + g_topology_unlock(); + } + v->size = size; +} - gp = v->geom; - if (gp == NULL) - return; +/* Return how many subdisks that constitute the original plex. */ +int +gv_sdcount(struct gv_plex *p, int growing) +{ + struct gv_sd *s; + int sdcount; - LIST_FOREACH(pp, &gp->provider, provider) { - pp->mediasize = size; + sdcount = p->sdcount; + if (growing) { + LIST_FOREACH(s, &p->subdisks, in_plex) { + if (s->flags & GV_SD_GROW) + sdcount--; + } } - v->size = size; + return (sdcount); } /* Calculates the plex size. */ @@ -351,6 +428,7 @@ { struct gv_sd *s; off_t size; + int sdcount; KASSERT(p != NULL, ("gv_plex_size: NULL p")); @@ -359,6 +437,7 @@ /* Adjust the size of our plex. */ size = 0; + sdcount = gv_sdcount(p, 1); switch (p->org) { case GV_PLEX_CONCAT: LIST_FOREACH(s, &p->subdisks, in_plex) @@ -366,11 +445,11 @@ break; case GV_PLEX_STRIPED: s = LIST_FIRST(&p->subdisks); - size = p->sdcount * s->size; + size = sdcount * s->size; break; case GV_PLEX_RAID5: s = LIST_FIRST(&p->subdisks); - size = (p->sdcount - 1) * s->size; + size = (sdcount - 1) * s->size; break; } @@ -413,7 +492,7 @@ /* The plex was added to an already running volume. 
*/ if (p->flags & GV_PLEX_ADDED) - state = GV_PLEX_DOWN; + gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); switch (p->org) { case GV_PLEX_STRIPED: @@ -430,7 +509,7 @@ if (required_sds) { if (p->sdcount < required_sds) { - state = GV_PLEX_DOWN; + gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); } /* @@ -439,18 +518,19 @@ s = LIST_FIRST(&p->subdisks); LIST_FOREACH(s2, &p->subdisks, in_plex) { if (s->size != s2->size) { - printf("geom_vinum: subdisk size mismatch " + printf("VINUM: subdisk size mismatch " "%s (%jd) <> %s (%jd)\n", s->name, s->size, s2->name, s2->size); - state = GV_PLEX_DOWN; + gv_set_plex_state(p, GV_PLEX_DOWN, + GV_SETSTATE_FORCE); } } - /* Trim subdisk sizes so that they match the stripe size. */ LIST_FOREACH(s, &p->subdisks, in_plex) { + /* Trim subdisk sizes to match the stripe size. */ remainder = s->size % p->stripesize; if (remainder) { - printf("gvinum: size of sd %s is not a " + printf("VINUM: size of sd %s is not a " "multiple of plex stripesize, taking off " "%jd bytes\n", s->name, (intmax_t)remainder); @@ -459,40 +539,25 @@ } } - /* Adjust the size of our plex. 
*/ - if (p->sdcount > 0) { - p->size = 0; - switch (p->org) { - case GV_PLEX_CONCAT: - LIST_FOREACH(s, &p->subdisks, in_plex) - p->size += s->size; - break; - - case GV_PLEX_STRIPED: - s = LIST_FIRST(&p->subdisks); - p->size = p->sdcount * s->size; - break; - - case GV_PLEX_RAID5: - s = LIST_FIRST(&p->subdisks); - p->size = (p->sdcount - 1) * s->size; - break; - - default: - break; - } - } - + p->size = gv_plex_size(p); if (p->sdcount == 0) - state = GV_PLEX_DOWN; + gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); else if ((p->flags & GV_PLEX_ADDED) || ((p->org == GV_PLEX_RAID5) && (p->flags & GV_PLEX_NEWBORN))) { LIST_FOREACH(s, &p->subdisks, in_plex) - s->state = GV_SD_STALE; + gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE); p->flags &= ~GV_PLEX_ADDED; - p->flags &= ~GV_PLEX_NEWBORN; - p->state = GV_PLEX_DOWN; + gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); + } else if (p->state == GV_PLEX_UP) { + LIST_FOREACH(s, &p->subdisks, in_plex) { + if (s->flags & GV_SD_GROW) { + p->state = GV_PLEX_GROWABLE; + break; + } + } } + /* Our plex is grown up now. */ + p->flags &= ~GV_PLEX_NEWBORN; } /* @@ -500,76 +565,83 @@ * freelist. */ int -gv_sd_to_drive(struct gv_softc *sc, struct gv_drive *d, struct gv_sd *s, - char *errstr, int errlen) +gv_sd_to_drive(struct gv_sd *s, struct gv_drive *d) { struct gv_sd *s2; struct gv_freelist *fl, *fl2; off_t tmp; int i; - g_topology_assert(); - fl2 = NULL; - KASSERT(sc != NULL, ("gv_sd_to_drive: NULL softc")); - KASSERT(d != NULL, ("gv_sd_to_drive: NULL drive")); - KASSERT(s != NULL, ("gv_sd_to_drive: NULL subdisk")); - KASSERT(errstr != NULL, ("gv_sd_to_drive: NULL errstr")); - KASSERT(errlen >= ERRBUFSIZ, ("gv_sd_to_drive: short errlen (%d)", - errlen)); + /* Shortcut for "referenced" drives. */ + /* XXX - insert into d->subdisks? */ + if (d->flags & GV_DRIVE_REFERENCED) { + s->drive_sc = d; + return (0); + } /* Check if this subdisk was already given to this drive. 
*/ - if (s->drive_sc == d) - return (0); + if (s->drive_sc != NULL) { + if (s->drive_sc == d) { + if (!(s->flags & GV_SD_TASTED)) { + return (0); + } + } else { + printf("VINUM: can't give sd '%s' to '%s' " + "(already on '%s')\n", s->name, d->name, + s->drive_sc->name); + return (GV_ERR_ISATTACHED); + } + } /* Preliminary checks. */ - if (s->size > d->avail || d->freelist_entries == 0) { - snprintf(errstr, errlen, "not enough space on '%s' for '%s'", - d->name, s->name); - return (-1); + if ((s->size > d->avail) || (d->freelist_entries == 0)) { + printf("VINUM: not enough space on '%s' for '%s'", d->name, + s->name); + return (GV_ERR_NOSPACE); } - /* No size given, autosize it. */ + /* If no size was given for this subdisk, try to auto-size it... */ if (s->size == -1) { /* Find the largest available slot. */ LIST_FOREACH(fl, &d->freelist, freelist) { - if (fl->size >= s->size) { - s->size = fl->size; - s->drive_offset = fl->offset; - fl2 = fl; - } + if (fl->size < s->size) + continue; + s->size = fl->size; + s->drive_offset = fl->offset; + fl2 = fl; } /* No good slot found? */ if (s->size == -1) { - snprintf(errstr, errlen, "couldn't autosize '%s' on " - "'%s'", s->name, d->name); - return (-1); + printf("VINUM: couldn't autosize '%s' on '%s'", + s->name, d->name); + return (GV_ERR_BADSIZE); } /* - * Check if we have a free slot that's large enough for the given size. + * ... or check if we have a free slot that's large enough for the + * given size. */ } else { i = 0; LIST_FOREACH(fl, &d->freelist, freelist) { - /* Yes, this subdisk fits. */ - if (fl->size >= s->size) { - i++; - /* Assign drive offset, if not given. */ - if (s->drive_offset == -1) - s->drive_offset = fl->offset; - fl2 = fl; - break; - } + if (fl->size < s->size) + continue; + /* Assign drive offset, if not given. */ + if (s->drive_offset == -1) + s->drive_offset = fl->offset; + fl2 = fl; + i++; + break; } /* Couldn't find a good free slot. 
*/ if (i == 0) { - snprintf(errstr, errlen, "free slots to small for '%s' " - "on '%s'", s->name, d->name); - return (-1); + printf("VINUM: free slots to small for '%s' on '%s'", + s->name, d->name); + return (GV_ERR_NOSPACE); } } @@ -604,9 +676,9 @@ /* Couldn't find a good free slot. */ if (i == 0) { - snprintf(errstr, errlen, "given drive_offset for '%s' " - "won't fit on '%s'", s->name, d->name); - return (-1); + printf("VINUM: given drive_offset for '%s' won't fit " + "on '%s'", s->name, d->name); + return (GV_ERR_NOSPACE); } } @@ -617,49 +689,41 @@ /* First, adjust the freelist. */ LIST_FOREACH(fl, &d->freelist, freelist) { + /* Look for the free slot that we have found before. */ + if (fl != fl2) + continue; + + /* The subdisk starts at the beginning of the free slot. */ + if (fl->offset == s->drive_offset) { + fl->offset += s->size; + fl->size -= s->size; + + /* The subdisk uses the whole slot, so remove it. */ + if (fl->size == 0) { + d->freelist_entries--; + LIST_REMOVE(fl, freelist); + } + /* + * The subdisk does not start at the beginning of the free + * slot. + */ + } else { + tmp = fl->offset + fl->size; + fl->size = s->drive_offset - fl->offset; - /* This is the free slot that we have found before. */ - if (fl == fl2) { - - /* - * The subdisk starts at the beginning of the free - * slot. - */ - if (fl->offset == s->drive_offset) { - fl->offset += s->size; - fl->size -= s->size; - - /* - * The subdisk uses the whole slot, so remove - * it. - */ - if (fl->size == 0) { - d->freelist_entries--; - LIST_REMOVE(fl, freelist); - } /* - * The subdisk does not start at the beginning of the - * free slot. + * The subdisk didn't use the complete rest of the free + * slot, so we need to split it. */ - } else { - tmp = fl->offset + fl->size; - fl->size = s->drive_offset - fl->offset; - - /* - * The subdisk didn't use the complete rest of - * the free slot, so we need to split it. 
- */ - if (s->drive_offset + s->size != tmp) { - fl2 = g_malloc(sizeof(*fl2), - M_WAITOK | M_ZERO); - fl2->offset = s->drive_offset + s->size; - fl2->size = tmp - fl2->offset; - LIST_INSERT_AFTER(fl, fl2, freelist); - d->freelist_entries++; - } + if (s->drive_offset + s->size != tmp) { + fl2 = g_malloc(sizeof(*fl2), M_WAITOK | M_ZERO); + fl2->offset = s->drive_offset + s->size; + fl2->size = tmp - fl2->offset; + LIST_INSERT_AFTER(fl, fl2, freelist); + d->freelist_entries++; } - break; } + break; } /* @@ -685,6 +749,8 @@ d->sdcount++; d->avail -= s->size; + s->flags &= ~GV_SD_TASTED; + /* Link back from the subdisk to this drive. */ s->drive_sc = d; @@ -869,17 +935,64 @@ return (NULL); } +/* Find a drive given a device. */ +struct gv_drive * +gv_find_drive_device(struct gv_softc *sc, char *device) +{ + struct gv_drive *d; + + LIST_FOREACH(d, &sc->drives, drive) { + if(!strcmp(d->device, device)) + return (d); + } + + return (NULL); +} + /* Check if any consumer of the given geom is open. */ int -gv_is_open(struct g_geom *gp) +gv_consumer_is_open(struct g_consumer *cp) { - struct g_consumer *cp; + if (cp == NULL) + return (0); + + if (cp->acr || cp->acw || cp->ace) + return (1); - if (gp == NULL) + return (0); +} + +int +gv_provider_is_open(struct g_provider *pp) { + if (pp == NULL) return (0); - LIST_FOREACH(cp, &gp->consumer, consumer) { - if (cp->acr || cp->acw || cp->ace) + if (pp->acr || pp->acw || pp->ace) + return (1); + + return (0); +} + +/* + * Compare the modification dates of the drives. + * Return 1 if a > b, 0 otherwise. 
+ */ +int +gv_drive_is_newer(struct gv_softc *sc, struct gv_drive *d) +{ + struct gv_drive *d2; + struct timeval *a, *b; + + KASSERT(!LIST_EMPTY(&sc->drives), + ("gv_is_drive_newer: empty drive list")); + + a = &d->hdr->label.last_update; + LIST_FOREACH(d2, &sc->drives, drive) { + if ((d == d2) || (d2->state != GV_DRIVE_UP) || + (d2->hdr == NULL)) + continue; + b = &d2->hdr->label.last_update; + if (timevalcmp(a, b, >)) return (1); } @@ -915,58 +1028,248 @@ return (GV_TYPE_DRIVE); } - return (-1); + return (GV_ERR_NOTFOUND); } void -gv_kill_drive_thread(struct gv_drive *d) +gv_setup_objects(struct gv_softc *sc) { - if (d->flags & GV_DRIVE_THREAD_ACTIVE) { - d->flags |= GV_DRIVE_THREAD_DIE; - wakeup(d); - while (!(d->flags & GV_DRIVE_THREAD_DEAD)) - tsleep(d, PRIBIO, "gv_die", hz); - d->flags &= ~GV_DRIVE_THREAD_ACTIVE; - d->flags &= ~GV_DRIVE_THREAD_DIE; - d->flags &= ~GV_DRIVE_THREAD_DEAD; - g_free(d->bqueue); - d->bqueue = NULL; - mtx_destroy(&d->bqueue_mtx); + struct g_provider *pp; + struct gv_volume *v; + struct gv_plex *p; + struct gv_sd *s; + struct gv_drive *d; + + LIST_FOREACH(s, &sc->subdisks, sd) { + d = gv_find_drive(sc, s->drive); + if (d != NULL) + gv_sd_to_drive(s, d); + p = gv_find_plex(sc, s->plex); + if (p != NULL) + gv_sd_to_plex(s, p); + gv_update_sd_state(s); + } + + LIST_FOREACH(p, &sc->plexes, plex) { + gv_update_plex_config(p); + v = gv_find_vol(sc, p->volume); + if (v != NULL && p->vol_sc != v) { + p->vol_sc = v; + v->plexcount++; + LIST_INSERT_HEAD(&v->plexes, p, in_volume); + } + gv_update_plex_config(p); + } + + LIST_FOREACH(v, &sc->volumes, volume) { + v->size = gv_vol_size(v); + if (v->provider == NULL) { + g_topology_lock(); + pp = g_new_providerf(sc->geom, "gvinum/%s", v->name); + pp->mediasize = v->size; + pp->sectorsize = 512; /* XXX */ + g_error_provider(pp, 0); + v->provider = pp; + pp->private = v; + g_topology_unlock(); + } else if (v->provider->mediasize != v->size) { + g_topology_lock(); + v->provider->mediasize = v->size; + 
g_topology_unlock(); + } + v->flags &= ~GV_VOL_NEWBORN; + gv_update_vol_state(v); } } void -gv_kill_plex_thread(struct gv_plex *p) +gv_cleanup(struct gv_softc *sc) { - if (p->flags & GV_PLEX_THREAD_ACTIVE) { - p->flags |= GV_PLEX_THREAD_DIE; - wakeup(p); - while (!(p->flags & GV_PLEX_THREAD_DEAD)) - tsleep(p, PRIBIO, "gv_die", hz); - p->flags &= ~GV_PLEX_THREAD_ACTIVE; - p->flags &= ~GV_PLEX_THREAD_DIE; - p->flags &= ~GV_PLEX_THREAD_DEAD; + struct gv_volume *v, *v2; + struct gv_plex *p, *p2; + struct gv_sd *s, *s2; + struct gv_drive *d, *d2; + struct gv_freelist *fl, *fl2; + + mtx_lock(&sc->config_mtx); + LIST_FOREACH_SAFE(v, &sc->volumes, volume, v2) { + LIST_REMOVE(v, volume); + g_free(v->wqueue); + g_free(v); + } + LIST_FOREACH_SAFE(p, &sc->plexes, plex, p2) { + LIST_REMOVE(p, plex); g_free(p->bqueue); + g_free(p->rqueue); g_free(p->wqueue); - p->bqueue = NULL; - p->wqueue = NULL; - mtx_destroy(&p->bqueue_mtx); + g_free(p); + } + LIST_FOREACH_SAFE(s, &sc->subdisks, sd, s2) { + LIST_REMOVE(s, sd); + g_free(s); + } + LIST_FOREACH_SAFE(d, &sc->drives, drive, d2) { + LIST_FOREACH_SAFE(fl, &d->freelist, freelist, fl2) { + LIST_REMOVE(fl, freelist); + g_free(fl); + } + LIST_REMOVE(d, drive); + g_free(d->hdr); + g_free(d); } + mtx_destroy(&sc->config_mtx); } -void -gv_kill_vol_thread(struct gv_volume *v) +/* General 'attach' routine. */ +int +gv_attach_plex(struct gv_plex *p, struct gv_volume *v, int rename) { - if (v->flags & GV_VOL_THREAD_ACTIVE) { - v->flags |= GV_VOL_THREAD_DIE; - wakeup(v); - while (!(v->flags & GV_VOL_THREAD_DEAD)) - tsleep(v, PRIBIO, "gv_die", hz); - v->flags &= ~GV_VOL_THREAD_ACTIVE; - v->flags &= ~GV_VOL_THREAD_DIE; - v->flags &= ~GV_VOL_THREAD_DEAD; - g_free(v->bqueue); - v->bqueue = NULL; - mtx_destroy(&v->bqueue_mtx); + struct gv_sd *s; + + g_topology_assert(); + + if (p->vol_sc != NULL) { + printf("VINUM: plex %s already attached", p->name); + return (GV_ERR_ISATTACHED); } + + /* Stale all subdisks of this plex. 
*/ + LIST_FOREACH(s, &p->subdisks, in_plex) { + if (s->state != GV_SD_STALE) + gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE); + } + /* Attach to volume. Make sure volume is not up and running. */ + if (gv_provider_is_open(v->provider)) { + printf("VINUM: volume %s is busy, cannot attach %s\n", v->name, + p->name); + return (GV_ERR_ISBUSY); + } + p->vol_sc = v; + strlcpy(p->volume, v->name, GV_MAXVOLNAME); + v->plexcount++; + if (rename) { + /* XXX: Check if taken?. */ + snprintf(p->name, GV_MAXPLEXNAME, "%s.p%d", v->name, + v->plexcount - 1); + /* XXX: Rename subdisks? Original vinum does not. */ +/* LIST_FOREACH(s, &p->subdisks, in_plex) + strlcpy(s->plex, newplexname, GV_MAXPLEXNAME);*/ + } + LIST_INSERT_HEAD(&v->plexes, p, in_volume); + + /* Get plex up again. */ + gv_update_vol_size(v, gv_vol_size(v)); + gv_set_plex_state(p, GV_PLEX_UP, 0); + gv_save_config(p->vinumconf); + return (0); +} + +int +gv_attach_sd(struct gv_sd *s, struct gv_plex *p, off_t offset, int rename) +{ + struct gv_sd *s2; + int error, sdcount; + + g_topology_assert(); + + /* If subdisk is attached, don't do it. */ + if (s->plex_sc != NULL) { + printf("VINUM: subdisk %s already attached", s->name); + return (GV_ERR_ISATTACHED); + } + + gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE); + /* First check that this subdisk has a correct offset. It is correct if + * it is a multiple of the stripesize and no other subdisk starts at it. */ + if (offset != -1 && offset % p->stripesize != 0) + return (GV_ERR_BADOFFSET); + LIST_FOREACH(s2, &p->subdisks, in_plex) { + if (s2->plex_offset == offset) + return (GV_ERR_BADOFFSET); + } + + /* Attach the subdisk to the plex at given offset.
*/ + s->plex_offset = offset; + strlcpy(s->plex, p->name, GV_MAXPLEXNAME); + + sdcount = p->sdcount; + error = gv_sd_to_plex(s, p); + if (error) + return (error); + gv_update_plex_config(p); + + if (rename) { + snprintf(s->name, GV_MAXSDNAME, "%s.s%d", s->plex, + p->sdcount - 1); + } + if (p->vol_sc != NULL) + gv_update_vol_size(p->vol_sc, gv_vol_size(p->vol_sc)); + gv_save_config(p->vinumconf); + /* We don't update the subdisk state since the user might have to + * initiate a rebuild/sync first. */ + return (0); +} + +/* Detach a plex from a volume. */ +int +gv_detach_plex(struct gv_plex *p, int flags) +{ + struct gv_volume *v; + + g_topology_assert(); + v = p->vol_sc; + + if (v == NULL) { + printf("VINUM: plex %s already detached\n", p->name); + return (0); /* Not an error. */ + } + + /* + * Only proceed if forced or volume inactive. + * XXX: Safe dropout if we're mirrored. + */ + if (!(flags & GV_FLAG_F) && (gv_provider_is_open(v->provider) || + p->state == GV_PLEX_UP)) { + printf("VINUM: volume busy\n"); + return (GV_ERR_ISBUSY); + } + v->plexcount--; + /* Make sure someone don't read us when gone. */ + v->last_read_plex = NULL; + LIST_REMOVE(p, in_volume); + p->vol_sc = NULL; + memset(p->volume, 0, GV_MAXVOLNAME); + gv_update_vol_size(v, gv_vol_size(v)); + gv_save_config(p->vinumconf); + return (0); +} + +/* Detach a subdisk from a plex. */ +int +gv_detach_sd(struct gv_sd *s, int flags) +{ + struct gv_plex *p; + + g_topology_assert(); + p = s->plex_sc; + + if (p == NULL) { + printf("VINUM: subdisk %s already detached\n", s->name); + return (0); /* Not an error. */ + } + + /* + * Don't proceed if we're not forcing, and the plex is up, or degraded + * with this subdisk up. 
+ */ + if (!(flags & GV_FLAG_F) && ((p->state != GV_PLEX_DOWN) || + ((p->state == GV_PLEX_DEGRADED) && (s->state == GV_SD_UP)))) + return (GV_ERR_ISBUSY); + + LIST_REMOVE(s, in_plex); + s->plex_sc = NULL; + memset(s->plex, 0, GV_MAXPLEXNAME); + p->sddetached++; + gv_save_config(s->vinumconf); + return (0); } Index: sys/geom/vinum/geom_vinum_var.h =================================================================== RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum_var.h,v retrieving revision 1.11 diff -u -u -r1.11 geom_vinum_var.h --- sys/geom/vinum/geom_vinum_var.h 6 Jan 2006 18:03:17 -0000 1.11 +++ sys/geom/vinum/geom_vinum_var.h 3 Nov 2007 02:40:17 -0000 @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 1997, 1998, 1999 * Nan Yang Computer Services Limited. All rights reserved. * @@ -112,12 +112,28 @@ #define GV_BIO_MALLOC 0x02 #define GV_BIO_ONHOLD 0x04 #define GV_BIO_SYNCREQ 0x08 -#define GV_BIO_SUCCEED 0x10 +#define GV_BIO_INIT 0x10 #define GV_BIO_REBUILD 0x20 #define GV_BIO_CHECK 0x40 #define GV_BIO_PARITY 0x80 #define GV_BIO_RETRY 0x100 +/* Error codes to be used within gvinum. */ +#define GV_ERR_SETSTATE (-1) /* Error setting state. */ +#define GV_ERR_BADSIZE (-2) /* Object has wrong size. */ +#define GV_ERR_INVTYPE (-3) /* Invalid object type. */ +#define GV_ERR_CREATE (-4) /* Error creating gvinum object. */ +#define GV_ERR_ISBUSY (-5) /* Object is busy. */ +#define GV_ERR_ISATTACHED (-6) /* Object is attached to another. */ +#define GV_ERR_INVFLAG (-7) /* Invalid flag passed. */ +#define GV_ERR_INVSTATE (-8) /* Invalid state. */ +#define GV_ERR_NOTFOUND (-9) /* Object not found. */ +#define GV_ERR_NAMETAKEN (-10) /* Object name is taken. */ +#define GV_ERR_NOSPACE (-11) /* No space left on drive/subdisk. */ +#define GV_ERR_BADOFFSET (-12) /* Invalid offset specified. */ +#define GV_ERR_INVNAME (-13) /* Invalid object name. */ +#define GV_ERR_PLEXORG (-14) /* Invalid plex organization. 
*/ + /* * hostname is 256 bytes long, but we don't need to shlep multiple copies in * vinum. We use the host name just to identify this system, and 32 bytes @@ -160,16 +176,65 @@ TAILQ_ENTRY(gv_bioq) queue; }; +#define GV_EVENT_DRIVE_TASTED 1 +#define GV_EVENT_DRIVE_LOST 2 +#define GV_EVENT_THREAD_EXIT 3 +#define GV_EVENT_CREATE_DRIVE 4 +#define GV_EVENT_CREATE_VOLUME 5 +#define GV_EVENT_CREATE_PLEX 6 +#define GV_EVENT_CREATE_SD 7 +#define GV_EVENT_SAVE_CONFIG 8 +#define GV_EVENT_RM_VOLUME 9 +#define GV_EVENT_RM_PLEX 10 +#define GV_EVENT_RM_SD 11 +#define GV_EVENT_RM_DRIVE 12 +#define GV_EVENT_SET_SD_STATE 13 +#define GV_EVENT_SET_DRIVE_STATE 14 +#define GV_EVENT_SET_VOL_STATE 15 +#define GV_EVENT_SET_PLEX_STATE 16 +#define GV_EVENT_RESET_CONFIG 17 +#define GV_EVENT_PARITY_REBUILD 18 +#define GV_EVENT_PARITY_CHECK 19 +#define GV_EVENT_START_PLEX 20 +#define GV_EVENT_START_VOLUME 21 +#define GV_EVENT_ATTACH_PLEX 22 +#define GV_EVENT_ATTACH_SD 23 +#define GV_EVENT_DETACH_PLEX 24 +#define GV_EVENT_DETACH_SD 25 +#define GV_EVENT_RENAME_VOL 26 +#define GV_EVENT_RENAME_PLEX 27 +#define GV_EVENT_RENAME_SD 28 +#define GV_EVENT_RENAME_DRIVE 29 +#define GV_EVENT_MOVE_SD 30 +#define GV_EVENT_SETUP_OBJECTS 31 + +#ifdef _KERNEL +struct gv_event { + int type; + void *arg1; + void *arg2; + intmax_t arg3; + intmax_t arg4; + TAILQ_ENTRY(gv_event) events; +}; +#endif + /* This struct contains the main vinum config. */ struct gv_softc { - /*struct mtx config_mtx; XXX not yet */ - /* Linked lists of all objects in our setup. */ LIST_HEAD(,gv_drive) drives; /* All drives. */ LIST_HEAD(,gv_plex) plexes; /* All plexes. */ LIST_HEAD(,gv_sd) subdisks; /* All subdisks. */ LIST_HEAD(,gv_volume) volumes; /* All volumes. */ + TAILQ_HEAD(,gv_event) equeue; /* Event queue. */ + struct mtx queue_mtx; /* Queue lock. */ + struct mtx config_mtx; /* Configuration lock. */ +#ifdef _KERNEL + struct bio_queue_head *bqueue; /* BIO queue. 
*/ +#else + char *padding; +#endif struct g_geom *geom; /* Pointer to our VINUM geom. */ }; @@ -186,26 +251,19 @@ int sdcount; /* Number of subdisks. */ int flags; -#define GV_DRIVE_THREAD_ACTIVE 0x01 /* Drive has an active worker thread. */ -#define GV_DRIVE_THREAD_DIE 0x02 /* Signal the worker thread to die. */ -#define GV_DRIVE_THREAD_DEAD 0x04 /* The worker thread has died. */ -#define GV_DRIVE_NEWBORN 0x08 /* The drive was just created. */ +#define GV_DRIVE_REFERENCED 0x01 /* The drive isn't really existing, + but was referenced by a subdisk + during taste. */ + + struct gv_hdr *hdr; /* The drive header. */ - struct gv_hdr *hdr; /* The drive header. */ + struct g_consumer *consumer; /* Consumer attached to this drive. */ int freelist_entries; /* Count of freelist entries. */ LIST_HEAD(,gv_freelist) freelist; /* List of freelist entries. */ LIST_HEAD(,gv_sd) subdisks; /* Subdisks on this drive. */ LIST_ENTRY(gv_drive) drive; /* Entry in the vinum config. */ -#ifdef _KERNEL - struct bio_queue_head *bqueue; /* BIO queue of this drive. */ -#else - char *padding; -#endif - struct mtx bqueue_mtx; /* Mtx. to protect the queue. */ - - struct g_geom *geom; /* The geom of this drive. */ struct gv_softc *vinumconf; /* Pointer to the vinum conf. */ }; @@ -228,8 +286,10 @@ int init_error; /* Flag error on initialization. */ int flags; -#define GV_SD_NEWBORN 0x01 /* Subdisk was just created. */ -#define GV_SD_INITCANCEL 0x02 /* Cancel initialization process. */ +#define GV_SD_NEWBORN 0x01 /* Subdisk is created by user. */ +#define GV_SD_TASTED 0x02 /* Subdisk is created during taste. */ +#define GV_SD_CANGOUP 0x04 /* Subdisk can go up immediately. */ +#define GV_SD_GROW 0x08 /* Subdisk is added to striped plex. */ char drive[GV_MAXDRIVENAME]; /* Name of underlying drive. */ char plex[GV_MAXPLEXNAME]; /* Name of associated plex. */ @@ -237,9 +297,6 @@ struct gv_drive *drive_sc; /* Pointer to underlying drive. */ struct gv_plex *plex_sc; /* Pointer to associated plex. 
*/ - struct g_provider *provider; /* The provider this sd represents. */ - struct g_consumer *consumer; /* Consumer attached to our provider. */ - LIST_ENTRY(gv_sd) from_drive; /* Subdisk list of underlying drive. */ LIST_ENTRY(gv_sd) in_plex; /* Subdisk list of associated plex. */ LIST_ENTRY(gv_sd) sd; /* Entry in the vinum config. */ @@ -255,7 +312,8 @@ #define GV_PLEX_DOWN 0 #define GV_PLEX_INITIALIZING 1 #define GV_PLEX_DEGRADED 2 -#define GV_PLEX_UP 3 +#define GV_PLEX_GROWABLE 3 +#define GV_PLEX_UP 4 int org; /* The plex organisation. */ #define GV_PLEX_DISORG 0 @@ -268,6 +326,7 @@ char volume[GV_MAXVOLNAME]; /* Name of associated volume. */ struct gv_volume *vol_sc; /* Pointer to associated volume. */ + int sddetached; /* Number of detached subdisks. */ int sdcount; /* Number of subdisks in this plex. */ int sddown; /* Number of subdisks that are down. */ int flags; @@ -277,26 +336,25 @@ #define GV_PLEX_THREAD_DIE 0x08 /* Signal the RAID5 thread to die. */ #define GV_PLEX_THREAD_DEAD 0x10 /* The RAID5 thread has died. */ #define GV_PLEX_NEWBORN 0x20 /* The plex was just created. */ +#define GV_PLEX_REBUILDING 0x40 /* The plex is rebuilding. */ +#define GV_PLEX_GROWING 0x80 /* The plex is growing. */ off_t synced; /* Count of synced bytes. */ - struct mtx bqueue_mtx; /* Lock for the BIO queue. */ -#ifdef _KERNEL - struct bio_queue_head *bqueue; /* BIO queue. */ - struct bio_queue_head *wqueue; /* Waiting BIO queue. */ -#else - char *bpad, *wpad; -#endif TAILQ_HEAD(,gv_raid5_packet) packets; /* RAID5 sub-requests. */ LIST_HEAD(,gv_sd) subdisks; /* List of attached subdisks. */ LIST_ENTRY(gv_plex) in_volume; /* Plex list of associated volume. */ LIST_ENTRY(gv_plex) plex; /* Entry in the vinum config. */ - struct g_provider *provider; /* The provider this plex represents. */ - struct g_consumer *consumer; /* Consumer attached to our provider. */ +#ifdef _KERNEL + struct bio_queue_head *bqueue; /* BIO queue. 
*/ + struct bio_queue_head *wqueue; /* Waiting BIO queue. */ + struct bio_queue_head *rqueue; /* Rebuild waiting BIO queue. */ +#else + char *bpad, *wpad, *rpad; /* Padding for userland. */ +#endif - struct g_geom *geom; /* The geom of this plex. */ struct gv_softc *vinumconf; /* Pointer to the vinum config. */ }; @@ -313,19 +371,20 @@ #define GV_VOL_THREAD_ACTIVE 0x01 /* Volume has an active thread. */ #define GV_VOL_THREAD_DIE 0x02 /* Signal the thread to die. */ #define GV_VOL_THREAD_DEAD 0x04 /* The thread has died. */ +#define GV_VOL_NEWBORN 0x08 /* The volume was just created. */ - struct mtx bqueue_mtx; /* Lock for the BIO queue. */ -#ifdef _KERNEL - struct bio_queue_head *bqueue; /* BIO queue. */ + LIST_HEAD(,gv_plex) plexes; /* List of attached plexes. */ + LIST_ENTRY(gv_volume) volume; /* Entry in vinum config. */ + + struct g_provider *provider; /* Provider of this volume. */ + +#ifdef _KERNEL + struct bio_queue_head *wqueue; /* BIO delayed request queue. */ #else - char *padding; + char *wpad; /* Padding for userland. */ #endif - LIST_HEAD(,gv_plex) plexes; /* List of attached plexes. */ - LIST_ENTRY(gv_volume) volume; /* Entry in vinum config. */ - struct gv_plex *last_read_plex; - struct g_geom *geom; /* The geom of this volume. */ struct gv_softc *vinumconf; /* Pointer to the vinum config. */ }; Index: sys/geom/vinum/geom_vinum_volume.c =================================================================== RCS file: /srv/ncvs/src/sys/geom/vinum/geom_vinum_volume.c,v retrieving revision 1.11 diff -u -u -r1.11 geom_vinum_volume.c --- sys/geom/vinum/geom_vinum_volume.c 6 Jan 2006 18:03:17 -0000 1.11 +++ sys/geom/vinum/geom_vinum_volume.c 3 Nov 2007 02:40:17 -0000 @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 2007 Lukas Ertl * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -29,208 +29,42 @@ #include #include -#include -#include -#include -#include #include #include -#include -#include #include #include #include #include -static void gv_vol_completed_request(struct gv_volume *, struct bio *); -static void gv_vol_normal_request(struct gv_volume *, struct bio *); - -static void -gv_volume_orphan(struct g_consumer *cp) +void +gv_volume_start(struct gv_softc *sc, struct bio *bp) { struct g_geom *gp; struct gv_volume *v; - int error; - - g_topology_assert(); - gp = cp->geom; - g_trace(G_T_TOPOLOGY, "gv_volume_orphan(%s)", gp->name); - if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0) - g_access(cp, -cp->acr, -cp->acw, -cp->ace); - error = cp->provider->error; - if (error == 0) - error = ENXIO; - g_detach(cp); - g_destroy_consumer(cp); - if (!LIST_EMPTY(&gp->consumer)) - return; - v = gp->softc; - if (v != NULL) { - gv_kill_vol_thread(v); - v->geom = NULL; - } - gp->softc = NULL; - g_wither_geom(gp, error); -} - -/* We end up here after the requests to our plexes are done. 
*/ -static void -gv_volume_done(struct bio *bp) -{ - struct gv_volume *v; - - v = bp->bio_from->geom->softc; - bp->bio_cflags |= GV_BIO_DONE; - mtx_lock(&v->bqueue_mtx); - bioq_insert_tail(v->bqueue, bp); - wakeup(v); - mtx_unlock(&v->bqueue_mtx); -} - -static void -gv_volume_start(struct bio *bp) -{ - struct gv_volume *v; - - switch(bp->bio_cmd) { - case BIO_READ: - case BIO_WRITE: - case BIO_DELETE: - break; - case BIO_GETATTR: - default: - g_io_deliver(bp, EOPNOTSUPP); - return; - } + struct gv_plex *p, *lp; - v = bp->bio_to->geom->softc; - if (v->state != GV_VOL_UP) { + gp = sc->geom; + v = bp->bio_to->private; + if (v == NULL || v->state != GV_VOL_UP) { + /* printf("VINUM: no volume for provider!\n"); */ g_io_deliver(bp, ENXIO); return; } - mtx_lock(&v->bqueue_mtx); - bioq_disksort(v->bqueue, bp); - wakeup(v); - mtx_unlock(&v->bqueue_mtx); -} - -static void -gv_vol_worker(void *arg) -{ - struct bio *bp; - struct gv_volume *v; - - v = arg; - KASSERT(v != NULL, ("NULL v")); - mtx_lock(&v->bqueue_mtx); - for (;;) { - /* We were signaled to exit. */ - if (v->flags & GV_VOL_THREAD_DIE) - break; - - /* Take the first BIO from our queue. */ - bp = bioq_takefirst(v->bqueue); - if (bp == NULL) { - msleep(v, &v->bqueue_mtx, PRIBIO, "-", hz/10); - continue; - } - mtx_unlock(&v->bqueue_mtx); - - if (bp->bio_cflags & GV_BIO_DONE) - gv_vol_completed_request(v, bp); - else - gv_vol_normal_request(v, bp); - - mtx_lock(&v->bqueue_mtx); - } - mtx_unlock(&v->bqueue_mtx); - v->flags |= GV_VOL_THREAD_DEAD; - wakeup(v); - - kthread_exit(ENXIO); -} - -static void -gv_vol_completed_request(struct gv_volume *v, struct bio *bp) -{ - struct bio *pbp; - struct g_geom *gp; - struct g_consumer *cp, *cp2; - - pbp = bp->bio_parent; - - if (pbp->bio_error == 0) - pbp->bio_error = bp->bio_error; - - switch (pbp->bio_cmd) { - case BIO_READ: - if (bp->bio_error == 0) - break; - - if (pbp->bio_cflags & GV_BIO_RETRY) - break; - - /* Check if we have another plex left. 
*/ - cp = bp->bio_from; - gp = cp->geom; - cp2 = LIST_NEXT(cp, consumer); - if (cp2 == NULL) - break; - - if (LIST_NEXT(cp2, consumer) == NULL) - pbp->bio_cflags |= GV_BIO_RETRY; - - g_destroy_bio(bp); - pbp->bio_children--; - mtx_lock(&v->bqueue_mtx); - bioq_disksort(v->bqueue, pbp); - mtx_unlock(&v->bqueue_mtx); - return; - - case BIO_WRITE: - case BIO_DELETE: - /* Remember if this write request succeeded. */ - if (bp->bio_error == 0) - pbp->bio_cflags |= GV_BIO_SUCCEED; - break; - } - - /* When the original request is finished, we deliver it. */ - pbp->bio_inbed++; - if (pbp->bio_inbed == pbp->bio_children) { - if (pbp->bio_cflags & GV_BIO_SUCCEED) - pbp->bio_error = 0; - pbp->bio_completed = bp->bio_length; - g_io_deliver(pbp, pbp->bio_error); - } - - g_destroy_bio(bp); -} - -static void -gv_vol_normal_request(struct gv_volume *v, struct bio *bp) -{ - struct bio_queue_head queue; - struct g_geom *gp; - struct gv_plex *p, *lp; - struct bio *cbp; - - gp = v->geom; - switch (bp->bio_cmd) { case BIO_READ: - cbp = g_clone_bio(bp); - if (cbp == NULL) { - g_io_deliver(bp, ENOMEM); - return; - } - cbp->bio_done = gv_volume_done; /* - * Try to find a good plex where we can send the request to. - * The plex either has to be up, or it's a degraded RAID5 plex. + * Try to find a good plex where we can send the request to, + * round-robin-style. The plex either has to be up, or it's a + * degraded RAID5 plex. Check if we have delayed requests. Put + * this request on the delayed queue if so. This makes sure that + * we don't read old values. 
*/ + if (bioq_first(v->wqueue) != NULL) { + bioq_insert_tail(v->wqueue, bp); + break; + } lp = v->last_read_plex; if (lp == NULL) lp = LIST_FIRST(&v->plexes); @@ -245,200 +79,59 @@ p = LIST_NEXT(p, in_volume); } while (p != lp); - if (p == NULL || + if ((p == NULL) || (p->org == GV_PLEX_RAID5 && p->state < GV_PLEX_DEGRADED) || (p->org != GV_PLEX_RAID5 && p->state <= GV_PLEX_DEGRADED)) { - g_destroy_bio(cbp); - bp->bio_children--; g_io_deliver(bp, ENXIO); return; } - g_io_request(cbp, p->consumer); v->last_read_plex = p; + /* Hand it down to the plex logic. */ + gv_plex_start(p, bp); break; case BIO_WRITE: case BIO_DELETE: - bioq_init(&queue); + /* Delay write-requests if any plex is synchronizing. */ LIST_FOREACH(p, &v->plexes, in_volume) { - if (p->state < GV_PLEX_DEGRADED) - continue; - cbp = g_clone_bio(bp); - if (cbp == NULL) { - for (cbp = bioq_first(&queue); cbp != NULL; - cbp = bioq_first(&queue)) { - bioq_remove(&queue, cbp); - g_destroy_bio(cbp); - } - if (bp->bio_error == 0) - bp->bio_error = ENOMEM; - g_io_deliver(bp, bp->bio_error); + if (p->flags & GV_PLEX_SYNCING) { + bioq_insert_tail(v->wqueue, bp); return; } - bioq_insert_tail(&queue, cbp); - cbp->bio_done = gv_volume_done; - cbp->bio_caller1 = p->consumer; } - /* Fire off all sub-requests. */ - for (cbp = bioq_first(&queue); cbp != NULL; - cbp = bioq_first(&queue)) { - bioq_remove(&queue, cbp); - g_io_request(cbp, cbp->bio_caller1); - } - break; - } -} -static int -gv_volume_access(struct g_provider *pp, int dr, int dw, int de) -{ - struct g_geom *gp; - struct g_consumer *cp, *cp2; - int error; - - gp = pp->geom; - - error = ENXIO; - LIST_FOREACH(cp, &gp->consumer, consumer) { - error = g_access(cp, dr, dw, de); - if (error) { - LIST_FOREACH(cp2, &gp->consumer, consumer) { - if (cp == cp2) - break; - g_access(cp2, -dr, -dw, -de); - } - return (error); + /* Give the BIO to each plex of this volume. 
*/ + LIST_FOREACH(p, &v->plexes, in_volume) { + if (p->state < GV_PLEX_DEGRADED) + continue; + gv_plex_start(p, bp); } + break; } - return (error); } -static struct g_geom * -gv_volume_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) +void +gv_bio_done(struct gv_softc *sc, struct bio *bp) { - struct g_geom *gp; - struct g_provider *pp2; - struct g_consumer *cp, *ocp; - struct gv_softc *sc; struct gv_volume *v; struct gv_plex *p; - int error, first; - - g_trace(G_T_TOPOLOGY, "gv_volume_taste(%s, %s)", mp->name, pp->name); - g_topology_assert(); - - /* First, find the VINUM class and its associated geom. */ - gp = find_vinum_geom(); - if (gp == NULL) - return (NULL); - - sc = gp->softc; - KASSERT(sc != NULL, ("gv_volume_taste: NULL sc")); - - gp = pp->geom; - - /* We only want to attach to plexes. */ - if (strcmp(gp->class->name, "VINUMPLEX")) - return (NULL); - - first = 0; - p = gp->softc; - - /* Let's see if the volume this plex wants is already configured. */ - v = gv_find_vol(sc, p->volume); - if (v == NULL) - return (NULL); - if (v->geom == NULL) { - gp = g_new_geomf(mp, "%s", p->volume); - gp->start = gv_volume_start; - gp->orphan = gv_volume_orphan; - gp->access = gv_volume_access; - gp->softc = v; - first++; - } else - gp = v->geom; - - /* Create bio queue, queue mutex, and worker thread, if necessary. */ - if (v->bqueue == NULL) { - v->bqueue = g_malloc(sizeof(struct bio_queue_head), - M_WAITOK | M_ZERO); - bioq_init(v->bqueue); - } - if (mtx_initialized(&v->bqueue_mtx) == 0) - mtx_init(&v->bqueue_mtx, "gv_plex", NULL, MTX_DEF); - - if (!(v->flags & GV_VOL_THREAD_ACTIVE)) { - kthread_create(gv_vol_worker, v, NULL, 0, 0, "gv_v %s", - v->name); - v->flags |= GV_VOL_THREAD_ACTIVE; - } - - /* - * Create a new consumer and attach it to the plex geom. Since this - * volume might already have a plex attached, we need to adjust the - * access counts of the new consumer. 
- */ - ocp = LIST_FIRST(&gp->consumer); - cp = g_new_consumer(gp); - g_attach(cp, pp); - if ((ocp != NULL) && (ocp->acr > 0 || ocp->acw > 0 || ocp->ace > 0)) { - error = g_access(cp, ocp->acr, ocp->acw, ocp->ace); - if (error) { - printf("GEOM_VINUM: failed g_access %s -> %s; " - "errno %d\n", v->name, p->name, error); - g_detach(cp); - g_destroy_consumer(cp); - if (first) - g_destroy_geom(gp); - return (NULL); - } - } - - p->consumer = cp; - - if (p->vol_sc != v) { - p->vol_sc = v; - v->plexcount++; - LIST_INSERT_HEAD(&v->plexes, p, in_volume); - } + struct gv_sd *s; - /* We need to setup a new VINUMVOLUME geom. */ - if (first) { - pp2 = g_new_providerf(gp, "gvinum/%s", v->name); - pp2->mediasize = pp->mediasize; - pp2->sectorsize = pp->sectorsize; - g_error_provider(pp2, 0); - v->size = pp2->mediasize; - v->geom = gp; - return (gp); + s = bp->bio_caller1; + KASSERT(s != NULL, ("gv_bio_done: NULL s")); + p = s->plex_sc; + KASSERT(p != NULL, ("gv_bio_done: NULL p")); + v = p->vol_sc; + KASSERT(v != NULL, ("gv_bio_done: NULL v")); + + switch (p->org) { + case GV_PLEX_CONCAT: + case GV_PLEX_STRIPED: + gv_plex_normal_done(p, bp); + break; + case GV_PLEX_RAID5: + gv_plex_raid5_done(p, bp); + break; } - - return (NULL); } - -static int -gv_volume_destroy_geom(struct gctl_req *req, struct g_class *mp, - struct g_geom *gp) -{ - struct gv_volume *v; - - g_trace(G_T_TOPOLOGY, "gv_volume_destroy_geom: %s", gp->name); - g_topology_assert(); - - v = gp->softc; - gv_kill_vol_thread(v); - g_wither_geom(gp, ENXIO); - return (0); -} - -#define VINUMVOLUME_CLASS_NAME "VINUMVOLUME" - -static struct g_class g_vinum_volume_class = { - .name = VINUMVOLUME_CLASS_NAME, - .version = G_VERSION, - .taste = gv_volume_taste, - .destroy_geom = gv_volume_destroy_geom, -}; - -DECLARE_GEOM_CLASS(g_vinum_volume_class, g_vinum_volume); Index: sys/modules/geom/geom_vinum/Makefile =================================================================== RCS file: 
/srv/ncvs/src/sys/modules/geom/geom_vinum/Makefile,v retrieving revision 1.4 diff -u -u -r1.4 Makefile --- sys/modules/geom/geom_vinum/Makefile 24 Nov 2005 15:11:41 -0000 1.4 +++ sys/modules/geom/geom_vinum/Makefile 3 Nov 2007 02:40:17 -0000 @@ -3,10 +3,10 @@ .PATH: ${.CURDIR}/../../../geom/vinum KMOD= geom_vinum -SRCS= geom_vinum.c geom_vinum_drive.c geom_vinum_plex.c \ +SRCS= geom_vinum.c geom_vinum_create.c geom_vinum_drive.c geom_vinum_plex.c \ geom_vinum_volume.c geom_vinum_subr.c geom_vinum_raid5.c \ geom_vinum_share.c geom_vinum_list.c geom_vinum_rm.c \ geom_vinum_init.c geom_vinum_state.c geom_vinum_rename.c \ - geom_vinum_move.c + geom_vinum_move.c geom_vinum_events.c .include Index: sbin/gvinum/gvinum.8 =================================================================== RCS file: /srv/ncvs/src/sbin/gvinum/gvinum.8,v retrieving revision 1.4 diff -u -u -r1.4 gvinum.8 --- sbin/gvinum/gvinum.8 30 Sep 2006 11:02:17 -0000 1.4 +++ sbin/gvinum/gvinum.8 3 Nov 2007 02:40:17 -0000 @@ -40,6 +40,13 @@ .Op Fl options .Sh COMMANDS .Bl -tag -width indent +.It Ic attach Ar plex volume Op Cm rename +.It Ic attach Ar subdisk plex Oo Ar offset Oc Op Cm rename +Attach a plex to a volume, or a subdisk to a plex. +If offset is specified, the subdisk will be attached to the given offset within +the plex. +If rename is specified, the subdisk or plex will change name according to the +object it attaches to. .It Ic checkparity Oo Fl f Oc Ar plex Check the parity blocks of a RAID-5 plex. The parity check will start at the @@ -49,7 +56,10 @@ the first location at which plex's parity is incorrect. All subdisks in the plex must be up for a parity check. -.It Ic create Op Ar description-file +.It Ic concat Oo Fl fv Oc Oo Fl n Ar name Oc Ar drives +Create a concatenated volume from the specified drives. +If no name is specified, a unique name will be set by gvinum. +.It Ic create Oo Fl f Oc Op Ar description-file Create a volume as described in .Ar description-file . 
If no @@ -57,6 +67,18 @@ provided, opens an editor and provides the current .Nm configuration for editing. +The +.Fl f +flag will make gvinum ignore any errors regarding creating objects that already +exist. +However, in contrast to vinum, objects that are not properly named in the +.Ar description-file +will not be created when the +.Fl f +flag is given. +.It Ic detach Oo Fl f Oc Op Ar plex | subdisk +Detach a plex or subdisk from the volume or plex to which it is +attached. .It Ic help Provides a synopsis of .Nm @@ -76,6 +98,14 @@ and .Fl V flags provide progressively more detailed output. +.It Ic mirror Oo Fl fsv Oc Oo Fl n Ar name Oc Ar drives +Create a mirrored volume from the specified drives. +It requires a multiple of 2 drives. +If no name is specified, a unique name will be set by gvinum. +If the +.Fl s +flag is specified, a striped mirror will be created, which requires a +multiple of 4 drives. .It Ic move | mv Fl f Ar drive subdisk Op Ar ... Move the subdisk(s) to the specified drive. The @@ -85,12 +115,19 @@ This can currently only be done when the subdisk is not being accessed. .Pp -If the subdisk(s) form part of a RAID-5 plex, the disk(s) will need to be set -to the +If a single subdisk is moved, and it forms a part of a RAID-5 plex, the moved +subdisk will need to be set to the +.Dq stale +state, and the plex will require a +.Ic start +command. +If multiple subdisks are moved, and form part of a RAID-5 plex, the +moved subdisks will need to be set to the .Dq up state and the plex will require a .Ic rebuildparity -command; if the subdisk(s) form part of a plex that is mirrored with other +command. +If the subdisk(s) form part of a plex that is mirrored with other plexes, the plex will require restarting and will sync once restarted. Moving more than one subdisk in a RAID-5 plex or subdisks from both sides of a @@ -105,6 +142,11 @@ when running in interactive mode. Normally this would be done by entering the EOF character.
+.It Ic raid5 Oo Fl fv Oc Oo Fl s Ar stripesize Oc Oo Fl n Ar name Oc Ar drives +Create a RAID-5 volume from the specified drives. +If no name is specified, a unique name will be set by +.Nm . +This organization requires at least three drives. .It Ic rename Oo Fl r Oc Ar drive | subdisk | plex | volume newname Change the name of the specified object. The @@ -143,9 +185,21 @@ Read configuration from all vinum drives. .It Ic start Oo Fl S Ar size Oc Ar volume | plex | subdisk Allow the system to access the objects. +If necessary, plexes will be synced and rebuilt. +If a subdisk was added to a running RAID-5 or striped plex, gvinum will +expand into this subdisk and grow the whole RAID-5 array. +This can be done without unmounting your filesystem. The .Fl S flag is currently ignored. +.It Ic stop Oo Fl f Oc Op Ar volume | plex | subdisk +Terminate access to the objects, or stop +.Nm +if no parameters are specified. +.It Ic stripe Oo Fl fv Oc Oo Fl n Ar name Oc Ar drives +Create a striped volume from the specified drives. If no name is specified, +a unique name will be set by gvinum. This organization requires at least two +drives. .El .Sh DESCRIPTION The @@ -217,15 +271,90 @@ directory with device nodes for .Nm objects -.It Pa /dev/gvinum/plex -directory containing device nodes for -.Nm -plexes -.It Pa /dev/gvinum/sd -directory containing device nodes for -.Nm -subdisks .El +.Sh EXAMPLES +To create a mirror on disks /dev/ad1 and /dev/ad2, create a filesystem, mount, +unmount and then stop gvinum: +.Pp +.Dl "gvinum mirror /dev/ad1 /dev/ad2" +.Dl "newfs /dev/gvinum/gvinumvolume0" +.Dl "mount /dev/gvinum/gvinumvolume0 /mnt" +.Dl "..."
+.Dl "umount /mnt" +.Dl "gvinum stop" +.Pp +To create a striped mirror on disks /dev/ad1 /dev/ad2 /dev/ad3 and /dev/ad4 +named "data" and create a filesystem: +.Pp +.Dl "gvinum mirror -s -n data /dev/ad1 /dev/ad2 /dev/ad3 /dev/ad4" +.Dl "newfs /dev/gvinum/data" +.Pp +To create a raid5 array on disks /dev/ad1 /dev/ad2 and /dev/ad3, with stripesize +493k you can use the raid5 command: +.Pp +.Dl "gvinum raid5 -s 493k -n myraid5vol /dev/ad1 /dev/ad2 /dev/ad3" +.Pp +Then the volume will be created automatically. +Afterwards, you have to initialize the volume: +.Pp +.Dl "gvinum start myraid5vol" +.Pp +The initialization will start, and the states will be updated when it's +finished. +The list command will give you information about its progress. +.Pp +Imagine that one of the drives fails, and the output of 'printconfig' looks +something like this: +.Pp +.Dl "drive gvinumdrive1 device /dev/ad2" +.Dl "drive gvinumdrive2 device /dev/???" +.Dl "drive gvinumdrive0 device /dev/ad1" +.Dl "volume myraid5vol" +.Dl "plex name myraid5vol.p0 org raid5 986s vol myraid5vol" +.Dl "sd name myraid5vol.p0.s2 drive gvinumdrive2 len 32538s driveoffset 265s" +.Dl "plex myraid5vol.p0 plexoffset 1972s" +.Dl "sd name myraid5vol.p0.s1 drive gvinumdrive1 len 32538s driveoffset 265s" +.Dl "plex myraid5vol.p0 plexoffset 986s" +.Dl "sd name myraid5vol.p0.s0 drive gvinumdrive0 len 32538s driveoffset 265s" +.Dl "plex myraid5vol.p0 plexoffset 0s" +.Pp +Create a new drive with this configuration: +.Pp +.Dl "drive gdrive4 device /dev/ad4" +.Pp +Then move the stale subdisk to the new drive: +.Pp +.Dl "gvinum move gdrive4 myraid5vol.p0.s2" +.Pp +Then, initiate the rebuild: +.Pp +.Dl "gvinum start myraid5vol.p0" +.Pp +The plex will go up from degraded mode after the rebuild is finished. +The plex can still be used while the rebuild is in progress, although requests +might be delayed. +For more advanced usage and a detailed explanation of gvinum, the +handbook is recommended.
+.Pp +Given the configuration as in the previous example, growing a RAID-5 or STRIPED +array is accomplished by adding a new subdisk to the plex with a +.Ar description-file +similar to this: +.Pp +.Dl "drive newdrive device /dev/ad4" +.Dl "sd drive newdrive plex myraid5vol.p0" +.Pp +If everything went ok, the plex state should now be set to growable. +You can then start the growing with the +.Ic start +command: +.Pp +.Dl "gvinum start myraid5vol.p0" +.Pp +As with rebuilding, you can watch the progress using the +.Ic list +command. +.Pp .Sh SEE ALSO .Xr geom 4 , .Xr geom 8 @@ -255,9 +384,13 @@ .An "Chris Jones" through the 2005 Google Summer of Code program. +A partial rewrite of gvinum was done by Lukas Ertl and Ulf Lilleengen +through the 2007 Google Summer of Code program. +The documentation has been updated to reflect the new functionality. .Sh AUTHORS .An Lukas Ertl Aq le@FreeBSD.org .An Chris Jones Aq soc-cjones@FreeBSD.org +.An Ulf Lilleengen Aq lulf@FreeBSD.org .Sh BUGS Currently, .Nm @@ -271,10 +404,6 @@ .Ic start is ignored. .Pp -The -.Ic stop -command does not work. -.Pp Moving subdisks that are not part of a mirrored or RAID-5 volume will destroy data. It is perhaps a bug to permit this. @@ -291,18 +420,10 @@ .Xr vinum 4 are not supported: .Bl -tag -width indent -.It Ic attach Ar plex volume Op Cm rename -.It Ic attach Ar subdisk plex Oo Ar offset Oc Op Cm rename -Attach a plex to a volume, or a subdisk to a plex. -.It Ic concat Oo Fl fv Oc Oo Fl n Ar name Oc Ar drives -Create a concatenated volume from the specified drives. .It Ic debug Cause the volume manager to enter the kernel debugger. .It Ic debug Ar flags Set debugging flags. -.It Ic detach Oo Fl f Oc Op Ar plex | subdisk -Detach a plex or subdisk from the volume or plex to which it is -attached. .It Ic dumpconfig Op Ar drive ... List the configuration information stored on the specified drives, or all drives in the system if no drive names are specified.
@@ -310,17 +431,9 @@ List information about volume manager state. .It Ic label Ar volume Create a volume label. -.It Ic mirror Oo Fl fsv Oc Oo Fl n Ar name Oc Ar drives -Create a mirrored volume from the specified drives. .It Ic resetstats Oo Fl r Oc Op Ar volume | plex | subdisk Reset statistics counters for the specified objects, or for all objects if none are specified. .It Ic setdaemon Op Ar value Set daemon configuration. -.It Ic stop Oo Fl f Oc Op Ar volume | plex | subdisk -Terminate access to the objects, or stop -.Nm -if no parameters are specified. -.It Ic stripe Oo Fl fv Oc Oo Fl n Ar name Oc Ar drives -Create a striped volume from the specified drives. .El Index: sbin/gvinum/gvinum.c =================================================================== RCS file: /srv/ncvs/src/sbin/gvinum/gvinum.c,v retrieving revision 1.8 diff -u -u -r1.8 gvinum.c --- sbin/gvinum/gvinum.c 23 Mar 2006 19:58:43 -0000 1.8 +++ sbin/gvinum/gvinum.c 3 Nov 2007 02:40:17 -0000 @@ -1,5 +1,7 @@ /* - * Copyright (c) 2004 Lukas Ertl, 2005 Chris Jones + * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 2005 Chris Jones + * Copyright (c) 2007 Ulf Lilleengen * All rights reserved. 
* * Portions of this software were developed for the FreeBSD Project @@ -43,6 +45,7 @@ #include #include +#include #include #include #include @@ -54,12 +57,17 @@ #include "gvinum.h" +void gvinum_attach(int, char **); +void gvinum_concat(int, char **); void gvinum_create(int, char **); +void gvinum_detach(int, char **); void gvinum_help(void); void gvinum_list(int, char **); void gvinum_move(int, char **); +void gvinum_mirror(int, char **); void gvinum_parityop(int, char **, int); void gvinum_printconfig(int, char **); +void gvinum_raid5(int, char **); void gvinum_rename(int, char **); void gvinum_resetconfig(void); void gvinum_rm(int, char **); @@ -67,9 +75,15 @@ void gvinum_setstate(int, char **); void gvinum_start(int, char **); void gvinum_stop(int, char **); +void gvinum_stripe(int, char **); void parseline(int, char **); void printconfig(FILE *, char *); +char *create_drive(char *); +void create_volume(int, char **, char *); +char *find_name(const char *, int, int); +char *find_pattern(char *, char *); + int main(int argc, char **argv) { @@ -111,6 +125,44 @@ exit(0); } +/* Attach a plex to a volume or a subdisk to a plex. 
*/ +void +gvinum_attach(int argc, char **argv) +{ + struct gctl_req *req; + const char *errstr; + int rename; + off_t offset; + + rename = 0; + offset = -1; + if (argc < 3) { + warnx("usage:\tattach [rename] " + "[]\n" + "\tattach [rename]"); + return; + } + if (argc > 3) { + if (!strcmp(argv[3], "rename")) { + rename = 1; + if (argc == 5) + offset = strtol(argv[4], NULL, 0); + } else + offset = strtol(argv[3], NULL, 0); + } + req = gctl_get_handle(); + gctl_ro_param(req, "class", -1, "VINUM"); + gctl_ro_param(req, "verb", -1, "attach"); + gctl_ro_param(req, "child", -1, argv[1]); + gctl_ro_param(req, "parent", -1, argv[2]); + gctl_ro_param(req, "offset", sizeof(off_t), &offset); + gctl_ro_param(req, "rename", sizeof(int), &rename); + errstr = gctl_issue(req); + if (errstr != NULL) + warnx("attach failed: %s", errstr); + gctl_free(req); +} + void gvinum_create(int argc, char **argv) { @@ -120,19 +172,30 @@ struct gv_sd *s; struct gv_volume *v; FILE *tmp; - int drives, errors, fd, line, plexes, plex_in_volume; - int sd_in_plex, status, subdisks, tokens, volumes; + int drives, errors, fd, flags, i, line, plexes, plex_in_volume; + int sd_in_plex, status, subdisks, tokens, undeffd, volumes; const char *errstr; - char buf[BUFSIZ], buf1[BUFSIZ], commandline[BUFSIZ], *ed; + char buf[BUFSIZ], buf1[BUFSIZ], commandline[BUFSIZ], *ed, *sdname; char original[BUFSIZ], tmpfile[20], *token[GV_MAXARGS]; char plex[GV_MAXPLEXNAME], volume[GV_MAXVOLNAME]; - if (argc == 2) { - if ((tmp = fopen(argv[1], "r")) == NULL) { - warn("can't open '%s' for reading", argv[1]); - return; - } - } else { + tmp = NULL; + flags = 0; + for (i = 1; i < argc; i++) { + /* Force flag used to ignore already created drives. */ + if (!strcmp(argv[i], "-f")) { + flags |= GV_FLAG_F; + /* Else it must be a file. */ + } else { + if ((tmp = fopen(argv[1], "r")) == NULL) { + warn("can't open '%s' for reading", argv[1]); + return; + } + } + } + + /* We didn't get a file. 
*/ + if (tmp == NULL) { snprintf(tmpfile, sizeof(tmpfile), "/tmp/gvinum.XXXXXX"); if ((fd = mkstemp(tmpfile)) == -1) { @@ -167,9 +230,11 @@ req = gctl_get_handle(); gctl_ro_param(req, "class", -1, "VINUM"); gctl_ro_param(req, "verb", -1, "create"); + gctl_ro_param(req, "flags", sizeof(int), &flags); drives = volumes = plexes = subdisks = 0; - plex_in_volume = sd_in_plex = 0; + plex_in_volume = sd_in_plex = undeffd = 0; + plex[0] = '\0'; errors = 0; line = 1; while ((fgets(buf, BUFSIZ, tmp)) != NULL) { @@ -270,8 +335,16 @@ /* Default name. */ if (strlen(s->name) == 0) { - snprintf(s->name, GV_MAXSDNAME, "%s.s%d", - plex, sd_in_plex++); + if (strlen(plex) == 0) { + sdname = find_name("gvinumsubdisk.p", + GV_TYPE_SD, GV_MAXSDNAME); + snprintf(s->name, GV_MAXSDNAME, + "%s.s%d", sdname, undeffd++); + free(sdname); + } else { + snprintf(s->name, GV_MAXSDNAME, + "%s.s%d",plex, sd_in_plex++); + } } /* Default plex. */ @@ -320,7 +393,279 @@ warnx("create failed: %s", errstr); } gctl_free(req); - gvinum_list(0, NULL); +} + +/* Create a concatenated volume. */ +void +gvinum_concat(int argc, char **argv) +{ + + if (argc < 2) { + warnx("usage:\tconcat [-fv] [-n name] drives\n"); + return; + } + create_volume(argc, argv, "concat"); +} + + +/* Create a drive quick and dirty. */ +char * +create_drive(char *device) +{ + struct gv_drive *d; + struct gctl_req *req; + const char *errstr; + char *drivename, *dname; + int drives, i, flags, volumes, subdisks, plexes; + + flags = plexes = subdisks = volumes = 0; + drives = 1; + dname = NULL; + + /* Strip away eventual /dev/ in front. 
*/ + if (strncmp(device, "/dev/", 5) == 0) + device += 5; + + drivename = find_name("gvinumdrive", GV_TYPE_DRIVE, GV_MAXDRIVENAME); + if (drivename == NULL) + return (NULL); + + req = gctl_get_handle(); + gctl_ro_param(req, "class", -1, "VINUM"); + gctl_ro_param(req, "verb", -1, "create"); + d = malloc(sizeof(struct gv_drive)); + if (d == NULL) + err(1, "unable to allocate for gv_drive object"); + memset(d, 0, sizeof(struct gv_drive)); + + strlcpy(d->name, drivename, GV_MAXDRIVENAME); + strlcpy(d->device, device, GV_MAXDRIVENAME); + gctl_ro_param(req, "drive0", sizeof(*d), d); + gctl_ro_param(req, "flags", sizeof(int), &flags); + gctl_ro_param(req, "drives", sizeof(int), &drives); + gctl_ro_param(req, "volumes", sizeof(int), &volumes); + gctl_ro_param(req, "plexes", sizeof(int), &plexes); + gctl_ro_param(req, "subdisks", sizeof(int), &subdisks); + errstr = gctl_issue(req); + if (errstr != NULL) { + warnx("error creating drive: %s", errstr); + gctl_free(req); + return (NULL); + } else { + gctl_free(req); + /* XXX: This is needed because we have to make sure the drives + * are created before we return. */ + /* Loop until it's in the config. */ + for (i = 0; i < 100000; i++) { + dname = find_name("gvinumdrive", GV_TYPE_DRIVE, + GV_MAXDRIVENAME); + /* If we got a different name, quit. */ + if (dname == NULL) + continue; + if (strcmp(dname, drivename)) { + free(dname); + return (drivename); + } + free(dname); + dname = NULL; + usleep(100000); /* Sleep for 0.1s */ + } + } + gctl_free(req); + return (drivename); +} + +/* + * General routine for creating a volume. Mainly for use by concat, mirror, + * raid5 and stripe commands. + */ +void +create_volume(int argc, char **argv, char *verb) +{ + struct gctl_req *req; + const char *errstr; + char buf[BUFSIZ], *drivename, *volname; + int drives, flags, i; + off_t stripesize; + + flags = 0; + drives = 0; + volname = NULL; + stripesize = 262144; + + /* XXX: Should we check for argument length? 
*/ + + req = gctl_get_handle(); + gctl_ro_param(req, "class", -1, "VINUM"); + + for (i = 1; i < argc; i++) { + if (!strcmp(argv[i], "-f")) { + flags |= GV_FLAG_F; + } else if (!strcmp(argv[i], "-n")) { + volname = argv[++i]; + } else if (!strcmp(argv[i], "-v")) { + flags |= GV_FLAG_V; + } else if (!strcmp(argv[i], "-s")) { + flags |= GV_FLAG_S; + if (!strcmp(verb, "raid5")) + stripesize = gv_sizespec(argv[++i]); + } else { + /* Assume it's a drive. */ + snprintf(buf, sizeof(buf), "drive%d", drives++); + + /* First we create the drive. */ + drivename = create_drive(argv[i]); + if (drivename == NULL) + goto bad; + /* Then we add it to the request. */ + gctl_ro_param(req, buf, -1, drivename); + } + } + + gctl_ro_param(req, "stripesize", sizeof(off_t), &stripesize); + + /* Find a free volume name. */ + if (volname == NULL) + volname = find_name("gvinumvolume", GV_TYPE_VOL, GV_MAXVOLNAME); + + /* Then we send a request to actually create the volumes. */ + gctl_ro_param(req, "verb", -1, verb); + gctl_ro_param(req, "flags", sizeof(int), &flags); + gctl_ro_param(req, "drives", sizeof(int), &drives); + gctl_ro_param(req, "name", -1, volname); + errstr = gctl_issue(req); + if (errstr != NULL) + warnx("creating %s volume failed: %s", verb, errstr); +bad: + gctl_free(req); +} + +/* Parse a line of the config, return the word after . */ +char * +find_pattern(char *line, char *pattern) +{ + char *ptr; + + ptr = strsep(&line, " "); + while (ptr != NULL) { + if (!strcmp(ptr, pattern)) { + /* Return the next. */ + ptr = strsep(&line, " "); + return (ptr); + } + ptr = strsep(&line, " "); + } + return (NULL); +} + +/* Find a free name for an object given a a prefix. */ +char * +find_name(const char *prefix, int type, int namelen) +{ + struct gctl_req *req; + char comment[1], buf[GV_CFG_LEN - 1], *name, *sname, *ptr; + const char *errstr; + int i, n, begin, len, conflict; + char line[1024]; + + comment[0] = '\0'; + + /* Find a name. Fetch out configuration first. 
*/ + req = gctl_get_handle(); + gctl_ro_param(req, "class", -1, "VINUM"); + gctl_ro_param(req, "verb", -1, "getconfig"); + gctl_ro_param(req, "comment", -1, comment); + gctl_rw_param(req, "config", sizeof(buf), buf); + errstr = gctl_issue(req); + if (errstr != NULL) { + warnx("can't get configuration: %s", errstr); + return (NULL); + } + gctl_free(req); + + begin = 0; + len = strlen(buf); + i = 0; + sname = malloc(namelen + 1); + + /* XXX: Max object setting? */ + for (n = 0; n < 10000; n++) { + snprintf(sname, namelen, "%s%d", prefix, n); + conflict = 0; + begin = 0; + /* Loop through the configuration line by line. */ + for (i = 0; i < len; i++) { + if (buf[i] == '\n' || buf[i] == '\0') { + ptr = buf + begin; + strlcpy(line, ptr, (i - begin) + 1); + begin = i + 1; + switch (type) { + case GV_TYPE_DRIVE: + name = find_pattern(line, "drive"); + break; + case GV_TYPE_VOL: + name = find_pattern(line, "volume"); + break; + case GV_TYPE_PLEX: + case GV_TYPE_SD: + name = find_pattern(line, "name"); + break; + default: + printf("Invalid type given\n"); + continue; + } + if (name == NULL) + continue; + if (!strcmp(sname, name)) { + conflict = 1; + /* XXX: Could quit the loop earlier. */ + } + } + } + if (!conflict) + return (sname); + } + free(sname); + return (NULL); +} + +/* Detach a plex or subdisk from its parent. 
*/ +void +gvinum_detach(int argc, char **argv) +{ + const char *errstr; + struct gctl_req *req; + int flags, i; + + optreset = 1; + optind = 1; + while ((i = getopt(argc, argv, "f")) != -1) { + switch(i) { + case 'f': + flags |= GV_FLAG_F; + break; + default: + warn("invalid flag: %c", i); + return; + } + } + argc -= optind; + argv += optind; + if (argc != 1) { + warnx("usage: detach [-f] | "); + return; + } + + req = gctl_get_handle(); + gctl_ro_param(req, "class", -1, "VINUM"); + gctl_ro_param(req, "verb", -1, "detach"); + gctl_ro_param(req, "object", -1, argv[0]); + gctl_ro_param(req, "flags", sizeof(int), &flags); + + errstr = gctl_issue(req); + if (errstr != NULL) + warnx("detach failed: %s", errstr); + gctl_free(req); } void @@ -329,8 +674,16 @@ printf("COMMANDS\n" "checkparity [-f] plex\n" " Check the parity blocks of a RAID-5 plex.\n" - "create description-file\n" + "create [-f] description-file\n" " Create as per description-file or open editor.\n" + "attach plex volume [rename]\n" + "attach subdisk plex [offset] [rename]\n" + " Attach a plex to a volume, or a subdisk to a plex\n" + "concat [-fv] [-n name] drives\n" + " Create a concatenated volume from the specified drives.\n" + "detach [-f] [plex | subdisk]\n" + " Detach a plex or a subdisk from the volume or plex to\n" + " which it is attached.\n" "l | list [-r] [-v] [-V] [volume | plex | subdisk]\n" " List information about specified objects.\n" "ld [-r] [-v] [-V] [volume]\n" @@ -341,11 +694,15 @@ " List information about plexes.\n" "lv [-r] [-v] [-V] [volume]\n" " List information about volumes.\n" + "mirror [-fsv] [-n name] drives\n" + " Create a mirrored volume from the specified drives.\n" "move | mv -f drive object ...\n" " Move the object(s) to the specified drive.\n" "quit Exit the vinum program when running in interactive mode." 
" Nor-\n" " mally this would be done by entering the EOF character.\n" + "raid5 [-fv] [-s stripesize] [-n name] drives\n" + " Create a RAID-5 volume from the specified drives.\n" "rename [-r] [drive | subdisk | plex | volume] newname\n" " Change the name of the specified object.\n" "rebuildparity plex [-f]\n" @@ -363,6 +720,8 @@ " poses only.\n" "start [-S size] volume | plex | subdisk\n" " Allow the system to access the objects.\n" + "stripe [-fv] [-n name] drives\n" + " Create a striped volume from the specified drives.\n" ); return; @@ -488,6 +847,18 @@ return; } +/* Create a mirrored volume. */ +void +gvinum_mirror(int argc, char **argv) +{ + + if (argc < 2) { + warnx("usage\tmirror [-fsv] [-n name] drives\n"); + return; + } + create_volume(argc, argv, "mirror"); +} + /* Note that move is currently of form '[-r] target object [...]' */ void gvinum_move(int argc, char **argv) @@ -553,8 +924,7 @@ gvinum_parityop(int argc, char **argv, int rebuild) { struct gctl_req *req; - int flags, i, rv; - off_t offset; + int flags, i; const char *errstr; char *op, *msg; @@ -591,47 +961,32 @@ return; } - do { - rv = 0; - req = gctl_get_handle(); - gctl_ro_param(req, "class", -1, "VINUM"); - gctl_ro_param(req, "verb", -1, "parityop"); - gctl_ro_param(req, "flags", sizeof(int), &flags); - gctl_ro_param(req, "rebuild", sizeof(int), &rebuild); - gctl_rw_param(req, "rv", sizeof(int), &rv); - gctl_rw_param(req, "offset", sizeof(off_t), &offset); - gctl_ro_param(req, "plex", -1, argv[0]); - errstr = gctl_issue(req); - if (errstr) { - warnx("%s\n", errstr); - gctl_free(req); - break; - } - gctl_free(req); - if (flags & GV_FLAG_V) { - printf("\r%s at %s ... 
", msg, - gv_roughlength(offset, 1)); - } - if (rv == 1) { - printf("Parity incorrect at offset 0x%jx\n", - (intmax_t)offset); - if (!rebuild) - break; - } - fflush(stdout); + req = gctl_get_handle(); + gctl_ro_param(req, "class", -1, "VINUM"); + gctl_ro_param(req, "verb", -1, op); + gctl_ro_param(req, "rebuild", sizeof(int), &rebuild); + gctl_ro_param(req, "flags", sizeof(int), &flags); + gctl_ro_param(req, "plex", -1, argv[0]); + + errstr = gctl_issue(req); + if (errstr) + warnx("%s\n", errstr); + gctl_free(req); +} + +/* Create a RAID-5 volume. */ +void +gvinum_raid5(int argc, char **argv) +{ - /* Clear the -f flag. */ - flags &= ~GV_FLAG_F; - } while (rv >= 0); - - if ((rv == 2) && (flags & GV_FLAG_V)) { - if (rebuild) - printf("Rebuilt parity on %s\n", argv[0]); - else - printf("%s has correct parity\n", argv[0]); + if (argc < 2) { + warnx("usage:\traid5 [-fv] [-s stripesize] [-n name] drives\n"); + return; } + create_volume(argc, argv, "raid5"); } + void gvinum_rename(int argc, char **argv) { @@ -728,7 +1083,6 @@ return; } gctl_free(req); - gvinum_list(0, NULL); } void @@ -763,7 +1117,6 @@ return; } gctl_free(req); - gvinum_list(0, NULL); printf("gvinum configuration obliterated\n"); } @@ -833,28 +1186,53 @@ } gctl_free(req); - gvinum_list(0, NULL); } void gvinum_stop(int argc, char **argv) { - int fileid; + int err, fileid; fileid = kldfind(GVINUMMOD); if (fileid == -1) { warn("cannot find " GVINUMMOD); return; } - if (kldunload(fileid) != 0) { + + /* + * This little hack prevents that we end up in an infinite loop in + * g_unload_class(). gv_unload() will return EAGAIN so that the GEOM + * event thread will be free for the g_wither_geom() call from + * gv_unload(). It's silly, but it works. + */ + printf("unloading " GVINUMMOD " kernel module... 
"); + fflush(stdout); + if ((err = kldunload(fileid)) != 0 && (errno == EAGAIN)) { + sleep(1); + err = kldunload(fileid); + } + if (err != 0) { + printf(" failed!\n"); warn("cannot unload " GVINUMMOD); return; } - warnx(GVINUMMOD " unloaded"); + printf("done\n"); exit(0); } +/* Create a striped volume. */ +void +gvinum_stripe(int argc, char **argv) +{ + + if (argc < 2) { + warnx("usage:\tstripe [-fv] [-n name] drives\n"); + return; + } + create_volume(argc, argv, "stripe"); +} + void parseline(int argc, char **argv) { @@ -865,6 +1243,12 @@ gvinum_create(argc, argv); else if (!strcmp(argv[0], "exit") || !strcmp(argv[0], "quit")) exit(0); + else if (!strcmp(argv[0], "attach")) + gvinum_attach(argc, argv); + else if (!strcmp(argv[0], "detach")) + gvinum_detach(argc, argv); + else if (!strcmp(argv[0], "concat")) + gvinum_concat(argc, argv); else if (!strcmp(argv[0], "help")) gvinum_help(); else if (!strcmp(argv[0], "list") || !strcmp(argv[0], "l")) @@ -877,12 +1261,16 @@ gvinum_list(argc, argv); else if (!strcmp(argv[0], "lv")) gvinum_list(argc, argv); + else if (!strcmp(argv[0], "mirror")) + gvinum_mirror(argc, argv); else if (!strcmp(argv[0], "move")) gvinum_move(argc, argv); else if (!strcmp(argv[0], "mv")) gvinum_move(argc, argv); else if (!strcmp(argv[0], "printconfig")) gvinum_printconfig(argc, argv); + else if (!strcmp(argv[0], "raid5")) + gvinum_raid5(argc, argv); else if (!strcmp(argv[0], "rename")) gvinum_rename(argc, argv); else if (!strcmp(argv[0], "resetconfig")) @@ -897,6 +1285,8 @@ gvinum_start(argc, argv); else if (!strcmp(argv[0], "stop")) gvinum_stop(argc, argv); + else if (!strcmp(argv[0], "stripe")) + gvinum_stripe(argc, argv); else if (!strcmp(argv[0], "checkparity")) gvinum_parityop(argc, argv, 0); else if (!strcmp(argv[0], "rebuildparity"))